xdlops_v4r4_fwd fp32/fp16 (#34)

* create files for xdlops * working on blockwise_gemm_xdlops * add KReduction * add m/n repeats * add 2x2 pipeline * added 128x128 wavegemm * use StaticBuffer of vector_type * break vector type to blk_size * add kpack into xldops_gemm and blockwise_gemm * abroadcast only * add fp32 mfma instructions * adding fp16 mfma * pack half4_t * rename kperwave to kpack * add 32x32x8fp16 * add fp16 mfma * clean code * clean code * V4r4 xdlops kpack (#35) * add kpack with incorrect results * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * add 1x1 kernel * add gridwise_gemm_v2 - single_buffer * enabled dwordx4 for fp16 Co-authored-by: Chao Liu <chao.liu2@amd.com> * refactor fwd-v4r4-xdlops * add v4r4-nhwc-xdlop * improve some perf of nhwc and nchw by tuning parameters, and change scheuduling in gridwise-gemm loop * tweak scheduling in gridwise gemm * add v4r3 with a single output copy * init commit: output with slice win * adding sliceWin * add multiple repeats pattern * starting adding bwd-v4r1-xdlops * use tuple as SrcBuffer * adding bwd-data v4r1 nhwc xdlops * fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2() * fix bug in host bwd-data conv * initial implementation of bwd-data v4r1 nhwc xdlops * add launch bound flags * enable launch bound * add m/nrepeat=4 * tweak bwd-data v4r1 nhwc xdlops * added bwd-data v4r1 nhwc xlops with output A and weight B * add fwd-v4r4 nhwc xdlops, A input, B weight, C output Co-authored-by: Chao Liu <chao.liu2@amd.com> [ROCm/composable_kernel commit: 3835318cc3]
2026-05-18 12:00:07 +00:00 · 2021-07-01 14:33:00 -05:00
parent 817b2a47c6
commit 67dcc552b6
54 changed files with 9813 additions and 245 deletions
--- a/driver/conv_bwd_data_driver.cpp
+++ b/driver/conv_bwd_data_driver.cpp
@@ -19,7 +19,22 @@ int main(int argc, char* argv[])
 {
    using namespace launcher;

-#if 0
+#if 1
+    // 1x1 filter, 14x14 image
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 1;
+    constexpr index_t WI = 128;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
    constexpr index_t N  = 64;
    constexpr index_t C  = 256;
    constexpr index_t HI = 56;
@@ -93,7 +108,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
    // 1x1 filter, 14x14 image
    constexpr index_t N  = 128;
    constexpr index_t C  = 512;
@@ -153,7 +168,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<2, 2>;
    using RightPads = Sequence<2, 2>;
-#elif 1
+#elif 0
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N  = 128;
    constexpr index_t C  = 128;
@@ -245,7 +260,7 @@ int main(int argc, char* argv[])
    device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
 #elif 0
    device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
-#elif 1
+#elif 0
    device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
 #elif 1
    device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
--- a/driver/conv_bwd_data_driver_v2.cpp
+++ b/driver/conv_bwd_data_driver_v2.cpp
@@ -0,0 +1,345 @@
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "conv_common.hpp"
+#include "host_conv_bwd_data.hpp"
+#include "device_tensor.hpp"
+#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
+#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
+
+#define USE_DYNAMIC_MODE 1
+#define USE_CONV_BWD_V4R1_XDL_NHWC 1
+#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1
+
+enum ConvBackwardDataAlgo
+{
+    V4R1XDLNHWC,
+    V4R1R2XDLNHWC,
+};
+
+int main(int argc, char* argv[])
+{
+    using namespace ck;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+
+#if USE_DYNAMIC_MODE
+    // dynamic mode
+    if(argc != 22)
+    {
+        printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
+        exit(1);
+    }
+
+    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(atoi(argv[1]));
+    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(atoi(argv[2]));
+    const bool do_verification      = atoi(argv[3]);
+    const int init_method           = atoi(argv[4]);
+    const bool do_log               = atoi(argv[5]);
+    const int nrepeat               = atoi(argv[6]);
+
+    const index_t N  = atoi(argv[7]);
+    const index_t K  = atoi(argv[8]);
+    const index_t C  = atoi(argv[9]);
+    const index_t Y  = atoi(argv[10]);
+    const index_t X  = atoi(argv[11]);
+    const index_t Hi = atoi(argv[12]);
+    const index_t Wi = atoi(argv[13]);
+
+    const index_t conv_stride_h   = atoi(argv[14]);
+    const index_t conv_stride_w   = atoi(argv[15]);
+    const index_t conv_dilation_h = atoi(argv[16]);
+    const index_t conv_dilation_w = atoi(argv[17]);
+    const index_t in_left_pad_h   = atoi(argv[18]);
+    const index_t in_left_pad_w   = atoi(argv[19]);
+    const index_t in_right_pad_h  = atoi(argv[20]);
+    const index_t in_right_pad_w  = atoi(argv[21]);
+
+    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+#else
+    // static mode
+    if(argc < 7)
+    {
+        printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
+        exit(1);
+    }
+
+    const ConvTensorLayout layout   = static_cast<ConvTensorLayout>(atoi(argv[1]));
+    const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(atoi(argv[2]));
+    const bool do_verification      = atoi(argv[3]);
+    const int init_method           = atoi(argv[4]);
+    const bool do_log               = atoi(argv[5]);
+    const int nrepeat               = atoi(argv[6]);
+
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 192;
+    constexpr index_t Hi = 71;
+    constexpr index_t Wi = 71;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    const index_t conv_stride_h   = 2;
+    const index_t conv_stride_w   = 2;
+    const index_t conv_dilation_h = 1;
+    const index_t conv_dilation_w = 1;
+    const index_t in_left_pad_h   = 1;
+    const index_t in_left_pad_w   = 1;
+    const index_t in_right_pad_h  = 1;
+    const index_t in_right_pad_w  = 1;
+
+    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const index_t XEff = (X - 1) * conv_dilation_w + 1;
+
+    const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+#endif
+
+#if 1
+    constexpr index_t in_vector_size = 1;
+    using in_data_t                  = float;
+    using acc_data_t                 = float;
+    using out_data_t                 = float;
+#elif 1
+    constexpr index_t in_vector_size = 1;
+    using in_data_t                  = half_t;
+    using acc_data_t                 = float;
+    using out_data_t                 = half_t;
+#endif
+
+    std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
+
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        // NCHW
+        in_lengths_host[0]  = static_cast<std::size_t>(N);
+        in_lengths_host[1]  = static_cast<std::size_t>(C);
+        in_lengths_host[2]  = static_cast<std::size_t>(Hi);
+        in_lengths_host[3]  = static_cast<std::size_t>(Wi);
+        wei_lengths_host[0] = static_cast<std::size_t>(K);
+        wei_lengths_host[1] = static_cast<std::size_t>(C);
+        wei_lengths_host[2] = static_cast<std::size_t>(Y);
+        wei_lengths_host[3] = static_cast<std::size_t>(X);
+        out_lengths_host[0] = static_cast<std::size_t>(N);
+        out_lengths_host[1] = static_cast<std::size_t>(K);
+        out_lengths_host[2] = static_cast<std::size_t>(Ho);
+        out_lengths_host[3] = static_cast<std::size_t>(Wo);
+        break;
+    case ConvTensorLayout::NHWC:
+        // NHWC
+        in_lengths_host[0]  = static_cast<std::size_t>(N);
+        in_lengths_host[1]  = static_cast<std::size_t>(Hi);
+        in_lengths_host[2]  = static_cast<std::size_t>(Wi);
+        in_lengths_host[3]  = static_cast<std::size_t>(C);
+        wei_lengths_host[0] = static_cast<std::size_t>(K);
+        wei_lengths_host[1] = static_cast<std::size_t>(Y);
+        wei_lengths_host[2] = static_cast<std::size_t>(X);
+        wei_lengths_host[3] = static_cast<std::size_t>(C);
+        out_lengths_host[0] = static_cast<std::size_t>(N);
+        out_lengths_host[1] = static_cast<std::size_t>(Ho);
+        out_lengths_host[2] = static_cast<std::size_t>(Wo);
+        out_lengths_host[3] = static_cast<std::size_t>(K);
+        break;
+    default: throw std::runtime_error("wrong! not implemented");
+    }
+
+    Tensor<in_data_t> in_host(in_lengths_host);
+    Tensor<in_data_t> in_device(in_lengths_host);
+    Tensor<in_data_t> wei(wei_lengths_host);
+    Tensor<out_data_t> out(out_lengths_host);
+
+    std::cout << "layout: " << layout << std::endl;
+    ostream_HostTensorDescriptor(in_host.mDesc, std::cout << "in: ");
+    ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
+    ostream_HostTensorDescriptor(out.mDesc, std::cout << "out: ");
+    print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
+    print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
+    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
+    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+
+    if(do_verification)
+    {
+        switch(init_method)
+        {
+        case 0:
+            wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+            out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+            break;
+        case 1:
+            wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+            out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+            break;
+        case 2:
+            wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+            out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+            break;
+        default:
+            wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+            out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        }
+    }
+
+    auto f_make_for_device_nchw = [&]() {
+#if USE_DYNAMIC_MODE
+        const auto in_lengths_dev     = make_tuple(N, C, Hi, Wi);
+        const auto wei_lengths_dev    = make_tuple(K, C, Y, X);
+        const auto out_lengths_dev    = make_tuple(N, K, Ho, Wo);
+        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
+        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
+        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
+        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
+#else
+        const auto in_lengths_dev =
+            make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
+        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
+        const auto out_lengths_dev =
+            make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
+        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
+        const auto conv_dilations_dev =
+            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
+        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
+        const auto in_right_pads_dev =
+            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
+#endif
+
+        return make_tuple(in_lengths_dev,
+                          wei_lengths_dev,
+                          out_lengths_dev,
+                          conv_strides_dev,
+                          conv_dilations_dev,
+                          in_left_pads_dev,
+                          in_right_pads_dev);
+    };
+
+    auto f_make_for_device_nhwc = [&]() {
+#if USE_DYNAMIC_MODE
+        const auto in_lengths_dev     = make_tuple(N, Hi, Wi, C);
+        const auto wei_lengths_dev    = make_tuple(K, Y, X, C);
+        const auto out_lengths_dev    = make_tuple(N, Ho, Wo, K);
+        const auto conv_strides_dev   = make_tuple(conv_stride_h, conv_stride_w);
+        const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
+        const auto in_left_pads_dev   = make_tuple(in_left_pad_h, in_left_pad_w);
+        const auto in_right_pads_dev  = make_tuple(in_right_pad_h, in_right_pad_w);
+#else
+        const auto in_lengths_dev =
+            make_tuple(Number<N>{}, Number<Hi>{}, Number<Wi>{}, Number<C>{});
+        const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<Y>{}, Number<X>{}, Number<C>{});
+        const auto out_lengths_dev =
+            make_tuple(Number<N>{}, Number<Ho>{}, Number<Wo>{}, Number<K>{});
+        const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
+        const auto conv_dilations_dev =
+            make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
+        const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
+        const auto in_right_pads_dev =
+            make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
+#endif
+
+        return make_tuple(in_lengths_dev,
+                          wei_lengths_dev,
+                          out_lengths_dev,
+                          conv_strides_dev,
+                          conv_dilations_dev,
+                          in_left_pads_dev,
+                          in_right_pads_dev);
+    };
+
+    const auto nhwc_desc = f_make_for_device_nhwc();
+
+#if USE_CONV_BWD_V4R1_XDL_NHWC
+    if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC)
+    {
+        if(layout != ConvTensorLayout::NHWC)
+        {
+            throw std::runtime_error("wrong! layout");
+        }
+
+        const auto tmp = f_make_for_device_nhwc();
+
+        device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<
+            in_data_t,
+            acc_data_t,
+            out_data_t>(tmp[I0],
+                        tmp[I1],
+                        tmp[I2],
+                        tmp[I3],
+                        tmp[I4],
+                        tmp[I5],
+                        tmp[I6],
+                        in_device,
+                        wei,
+                        out,
+                        nrepeat);
+    }
+#endif
+
+#if USE_CONV_BWD_V4R1R2_XDL_NHWC
+    if(algo == ConvBackwardDataAlgo::V4R1R2XDLNHWC)
+    {
+        if(layout != ConvTensorLayout::NHWC)
+        {
+            throw std::runtime_error("wrong! layout");
+        }
+
+        const auto tmp = f_make_for_device_nhwc();
+
+        device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<
+            in_data_t,
+            acc_data_t,
+            out_data_t>(tmp[I0],
+                        tmp[I1],
+                        tmp[I2],
+                        tmp[I3],
+                        tmp[I4],
+                        tmp[I5],
+                        tmp[I6],
+                        in_device,
+                        wei,
+                        out,
+                        nrepeat);
+    }
+#endif
+
+    if(do_verification)
+    {
+        host_direct_convolution_backward_data(in_host,
+                                              wei,
+                                              out,
+                                              make_tuple(conv_stride_h, conv_stride_w),
+                                              make_tuple(conv_dilation_h, conv_dilation_w),
+                                              make_tuple(in_left_pad_h, in_left_pad_w),
+                                              make_tuple(in_right_pad_h, in_right_pad_w),
+                                              layout);
+
+        check_error(in_host, in_device);
+
+        if(do_log)
+        {
+            LogRangeAsType<float>(std::cout << "out : ", out.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "in_host  : ", in_host.mData, ",") << std::endl;
+            LogRangeAsType<float>(std::cout << "in_device: ", in_device.mData, ",") << std::endl;
+        }
+    }
+}
--- a/driver/conv_driver.cpp
+++ b/driver/conv_driver.cpp
@@ -26,18 +26,32 @@ int main(int argc, char* argv[])
    }

    const bool do_verification = atoi(argv[1]);
-    const int init_method      = atoi(argv[2]);
-    const bool do_log          = atoi(argv[3]);
+    const bool do_log          = atoi(argv[2]);
+    const int init_method      = atoi(argv[3]);
    const int nrepeat          = atoi(argv[4]);

 #if 0
-    constexpr index_t N  = 8;
-    constexpr index_t C  = 8;
-    constexpr index_t Hi = 4;
-    constexpr index_t Wi = 8;
+    constexpr index_t N  = 256;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 16;
+    constexpr index_t WI = 16;
    constexpr index_t K  = 256;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using InLeftPads  = Sequence<0, 0>;
+    using InRightPads = Sequence<0, 0>;
+#elif 0
+    constexpr index_t N  = 1;
+    constexpr index_t C  = 16;
+    constexpr index_t HI = 1080;
+    constexpr index_t WI = 1920;
+    constexpr index_t K  = 16;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
@@ -162,9 +176,9 @@ int main(int argc, char* argv[])
    // 3x3, 71x71
    constexpr index_t N  = 128;
    constexpr index_t C  = 192;
-    constexpr index_t Hi = 71;
-    constexpr index_t Wi = 71;
-    constexpr index_t K  = 128;
+    constexpr index_t HI = 71;
+    constexpr index_t WI = 71;
+    constexpr index_t K  = 256;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

@@ -430,7 +444,7 @@ int main(int argc, char* argv[])
    using InRightPads = Sequence<0, 0>;
 #elif 0
    // 1x1, 14x14, stride 2
-    constexpr index_t N  = 128;
+    constexpr index_t N  = 256;
    constexpr index_t C  = 1024;
    constexpr index_t Hi = 14;
    constexpr index_t Wi = 14;
@@ -445,7 +459,7 @@ int main(int argc, char* argv[])
    using InRightPads = Sequence<0, 0>;
 #elif 0
    // 1x1, 14x14
-    constexpr index_t N  = 128;
+    constexpr index_t N  = 256;
    constexpr index_t C  = 1024;
    constexpr index_t Hi = 14;
    constexpr index_t Wi = 14;
@@ -636,6 +650,11 @@ int main(int argc, char* argv[])
    using in_data_t                  = typename vector_type<float, in_vector_size>::type;
    using acc_data_t                 = float;
    using out_data_t                 = float;
+#elif 1
+    using in_data_t                  = half_t;
+    constexpr index_t in_vector_size = 1;
+    using acc_data_t                 = float;
+    using out_data_t                 = half_t;
 #elif 0
    constexpr index_t in_vector_size = 1;
    using in_data_t                  = typename vector_type<float, in_vector_size>::type;
--- a/driver/conv_driver_v2.cpp
+++ b/driver/conv_driver_v2.cpp
@@ -16,19 +16,31 @@
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v4r5_nchw_kcyx_nkhw.hpp"
 #include "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
+#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
+#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp"
+#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp"
+#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"

 #define USE_DYNAMIC_MODE 1
 #define USE_CONV_FWD_V4R4_NCHW 0
 #define USE_CONV_FWD_V4R4_NHWC 0
-#define USE_CONV_FWD_V4R5_NCHW 1
+#define USE_CONV_FWD_V4R5_NCHW 0
 #define USE_CONV_FWD_V5R1_NCHW 0
+#define USE_CONV_FWD_V4R4_XDL_NCHW 0
+#define USE_CONV_FWD_V4R4R2_XDL_NHWC 0
+#define USE_CONV_FWD_V4R4R3_XDL_NHWC 1
+#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1

 enum ConvForwardAlgo
 {
-    V4R4NCHW,
-    V4R4NHWC,
-    V4R5NCHW,
-    V5R1NCHW
+    V4R4NCHW,      // 0
+    V4R4NHWC,      // 1
+    V4R5NCHW,      // 2
+    V5R1NCHW,      // 3
+    V4R4XDLNCHW,   // 4
+    V4R4R2XDLNHWC, // 5
+    V4R4R3XDLNHWC, // 6
+    V4R4R4XDLNHWC  // 7
 };

 int main(int argc, char* argv[])
@@ -97,21 +109,21 @@ int main(int argc, char* argv[])
    const int nrepeat             = atoi(argv[6]);

    constexpr index_t N  = 128;
-    constexpr index_t C  = 128;
-    constexpr index_t Hi = 17;
-    constexpr index_t Wi = 17;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 7;
+    constexpr index_t C  = 192;
+    constexpr index_t Hi = 71;
+    constexpr index_t Wi = 71;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;

-    const index_t conv_stride_h   = 1;
-    const index_t conv_stride_w   = 1;
+    const index_t conv_stride_h   = 2;
+    const index_t conv_stride_w   = 2;
    const index_t conv_dilation_h = 1;
    const index_t conv_dilation_w = 1;
-    const index_t in_left_pad_h   = 0;
-    const index_t in_left_pad_w   = 3;
-    const index_t in_right_pad_h  = 0;
-    const index_t in_right_pad_w  = 3;
+    const index_t in_left_pad_h   = 1;
+    const index_t in_left_pad_w   = 1;
+    const index_t in_right_pad_h  = 1;
+    const index_t in_right_pad_w  = 1;

    const index_t YEff = (Y - 1) * conv_dilation_h + 1;
    const index_t XEff = (X - 1) * conv_dilation_w + 1;
@@ -120,11 +132,16 @@ int main(int argc, char* argv[])
    const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
 #endif

-#if 1
+#if 0
    constexpr index_t in_vector_size = 1;
    using in_data_t                  = float;
    using acc_data_t                 = float;
    using out_data_t                 = float;
+#elif 1
+    constexpr index_t in_vector_size = 1;
+    using in_data_t                  = half_t;
+    using acc_data_t                 = float;
+    using out_data_t                 = half_t;
 #elif 1
    constexpr index_t in_vector_size = 16;
    using in_data_t                  = int8_t;
@@ -384,6 +401,114 @@ int main(int argc, char* argv[])
    }
 #endif

+#if USE_CONV_FWD_V4R4_XDL_NCHW
+    if(algo == ConvForwardAlgo::V4R4XDLNCHW)
+    {
+        if(layout != ConvTensorLayout::NCHW)
+        {
+            throw std::runtime_error("wrong! layout");
+        }
+
+        const auto tmp = f_make_for_device_nchw();
+
+        device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
+                                                                                      acc_data_t,
+                                                                                      out_data_t>(
+            tmp[I0],
+            tmp[I1],
+            tmp[I2],
+            tmp[I3],
+            tmp[I4],
+            tmp[I5],
+            tmp[I6],
+            in,
+            wei,
+            out_device,
+            nrepeat);
+    }
+#endif
+
+#if USE_CONV_FWD_V4R4R2_XDL_NHWC
+    if(algo == ConvForwardAlgo::V4R4R2XDLNHWC)
+    {
+        if(layout != ConvTensorLayout::NHWC)
+        {
+            throw std::runtime_error("wrong! layout");
+        }
+
+        const auto tmp = f_make_for_device_nhwc();
+
+        device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk<in_data_t,
+                                                                                      acc_data_t,
+                                                                                      out_data_t>(
+            tmp[I0],
+            tmp[I1],
+            tmp[I2],
+            tmp[I3],
+            tmp[I4],
+            tmp[I5],
+            tmp[I6],
+            in,
+            wei,
+            out_device,
+            nrepeat);
+    }
+#endif
+
+#if USE_CONV_FWD_V4R4R3_XDL_NHWC
+    if(algo == ConvForwardAlgo::V4R4R3XDLNHWC)
+    {
+        if(layout != ConvTensorLayout::NHWC)
+        {
+            throw std::runtime_error("wrong! layout");
+        }
+
+        const auto tmp = f_make_for_device_nhwc();
+
+        device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk<in_data_t,
+                                                                                      acc_data_t,
+                                                                                      out_data_t>(
+            tmp[I0],
+            tmp[I1],
+            tmp[I2],
+            tmp[I3],
+            tmp[I4],
+            tmp[I5],
+            tmp[I6],
+            in,
+            wei,
+            out_device,
+            nrepeat);
+    }
+#endif
+
+#if USE_CONV_FWD_V4R4R4_XDL_NHWC
+    if(algo == ConvForwardAlgo::V4R4R4XDLNHWC)
+    {
+        if(layout != ConvTensorLayout::NHWC)
+        {
+            throw std::runtime_error("wrong! layout");
+        }
+
+        const auto tmp = f_make_for_device_nhwc();
+
+        device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
+                                                                                      acc_data_t,
+                                                                                      out_data_t>(
+            tmp[I0],
+            tmp[I1],
+            tmp[I2],
+            tmp[I3],
+            tmp[I4],
+            tmp[I5],
+            tmp[I6],
+            in,
+            wei,
+            out_device,
+            nrepeat);
+    }
+#endif
+
    if(do_verification)
    {
        host_direct_convolution(in,
@@ -397,6 +522,7 @@ int main(int argc, char* argv[])

        check_error(out_host, out_device);

+#if 0
        if(do_log)
        {
            LogRange(std::cout << "in : ", in.mData, ",") << std::endl;
@@ -404,5 +530,6 @@ int main(int argc, char* argv[])
            LogRange(std::cout << "out_host  : ", out_host.mData, ",") << std::endl;
            LogRange(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
        }
+#endif
    }
 }
--- a/driver/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/driver/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,340 @@
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp"
+#include "driver_dynamic_gemm_xdlops_v2r3.hpp"
+
+template <typename TInWei,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk(
+    const InLengths& in_n_hi_wi_c_lengths,
+    const WeiLengths& wei_k_y_x_c_lengths,
+    const OutLengths& out_n_ho_wo_k_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Tensor<TInWei>& in_n_hi_wi_c,
+    const Tensor<TInWei>& wei_k_y_x_c,
+    const Tensor<TOut>& out_n_ho_wo_k,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+
+    std::cout << __func__ << std::endl;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+    constexpr auto I7 = Number<7>{};
+    constexpr auto I8 = Number<8>{};
+
+    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
+    DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
+    DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
+
+    in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
+    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
+    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
+
+    const auto in_n_hi_wi_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
+    const auto wei_k_y_x_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
+    const auto out_n_ho_wo_k_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
+
+#if 1
+    // [M, N, K0, K1] = [128, 128, 4, 4] for fp32
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 4;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM  = 2;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
+#elif 1
+    // [M, N, K0, K1] = [128, 128, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM  = 2;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
+#elif 1
+    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM  = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
+#elif 1
+    // [M, N, K0, K1] = [128, 256, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 256;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 4;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM  = 2;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
+#elif 0
+    // [M, N, K0, K1] = [256, 128, 4, 4]
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 4;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmM  = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
+#endif
+
+    const auto descs =
+        transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk(wei_k_y_x_c_desc,
+                                                                          out_n_ho_wo_k_desc,
+                                                                          in_n_hi_wi_c_desc,
+                                                                          conv_strides,
+                                                                          conv_dilations,
+                                                                          in_left_pads,
+                                                                          in_right_pads,
+                                                                          I0,
+                                                                          I0,
+                                                                          Number<GemmK1>{});
+
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
+    const auto out_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
+    const auto in_gemmm_gemmn_grid_desc          = descs[I2];
+
+    // HACK: hacks that control index calculation when iterating over A, B, C matrix
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 0+: gemmk0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: gemmm
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 2+: gemmk1
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 0-: Gemmk0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: Gemmm
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: Gemmk1
+
+    constexpr auto out_gemmk0_gemmn_gemmk1_grid_iterator_hacks = make_tuple(
+        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 0+: gemmk0
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},   // 1+: gemmn
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 2+: gemmk1
+        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 0-: gemmk0
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{},   // 1-: gemmn
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: gemmk1
+
+    constexpr auto in_m0_m1_m2_n_grid_iterator_hacks = make_tuple(
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 0+: MRepeat
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},  // 1+: NRepeat
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 2+: MWaves
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},  // 3+: NWaves
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 4+: M0
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 5+: M1
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 6+: M2
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}), // 7+: N1
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: MRepeat
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 1-: NRepeat
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 2-: MWaves
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 3-: NWaves
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 4-: M0
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 5-: M1
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 6-: M2
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})); // 7-: N1
+
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{};
+
+    constexpr auto out_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0>{};
+
+    for(index_t i = 0; i < 5; ++i)
+    {
+        float ave_time = driver_dynamic_gemm_xdlops_v2r3<
+            BlockSize,
+            TInWei,
+            TAcc,
+            TOut,
+            InMemoryDataOperation::Set,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
+            decltype(out_gemmk0_gemmn_gemmk1_grid_desc),
+            decltype(in_gemmm_gemmn_grid_desc),
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerWave,
+            GemmNPerWave,
+            MRepeat,
+            NRepeat,
+            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
+            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
+            Sequence<2, 0, 1>,
+            Sequence<0, 2, 1>,
+            1,
+            GemmABlockTransferSrcScalarPerVector_GemmM,
+            GemmABlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
+            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmBBlockTransferSrcScalarPerVector_GemmK1,
+            GemmBBlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            Sequence<1, 3, 7, 0, 2, 4, 5, 6>,
+            6,
+            GemmCThreadTransferDstScalarPerVector,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks),
+            decltype(out_gemmk0_gemmn_gemmk1_grid_iterator_hacks),
+            decltype(in_m0_m1_m2_n_grid_iterator_hacks),
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks),
+            decltype(out_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks),
+            false // CAccessOrderMRepeatNRepeat
+            >(static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
+              static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
+              static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
+              wei_gemmk0_gemmm_gemmk1_grid_desc,
+              out_gemmk0_gemmn_gemmk1_grid_desc,
+              in_gemmm_gemmn_grid_desc,
+              wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks,
+              out_gemmk0_gemmn_gemmk1_grid_iterator_hacks,
+              in_m0_m1_m2_n_grid_iterator_hacks,
+              wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks,
+              out_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks,
+              nrepeat);
+
+        {
+            const auto N = out_n_ho_wo_k_lengths[I0];
+            const auto K = out_n_ho_wo_k_lengths[I3];
+            const auto C = wei_k_y_x_c_lengths[I3];
+
+            const auto Hi = in_n_hi_wi_c_lengths[I1];
+            const auto Wi = in_n_hi_wi_c_lengths[I2];
+
+            const auto Ho = out_n_ho_wo_k_lengths[I1];
+            const auto Wo = out_n_ho_wo_k_lengths[I2];
+
+            const auto Y = wei_k_y_x_c_lengths[I1];
+            const auto X = wei_k_y_x_c_lengths[I2];
+
+            float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
+                         (std::size_t(1000) * 1000 * 1000) / ave_time;
+
+            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
+                      << std::endl;
+        }
+    }
+
+    // copy result back to host
+    in_n_hi_wi_c_device_buf.FromDevice(in_n_hi_wi_c.mData.data());
+}
--- a/driver/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/driver/include/device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,288 @@
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp"
+#include "driver_dynamic_gemm_xdlops_v2r3.hpp"
+
+template <typename TInWei,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk(
+    const InLengths& in_n_hi_wi_c_lengths,
+    const WeiLengths& wei_k_y_x_c_lengths,
+    const OutLengths& out_n_ho_wo_k_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Tensor<TInWei>& in_n_hi_wi_c,
+    const Tensor<TInWei>& wei_k_y_x_c,
+    const Tensor<TOut>& out_n_ho_wo_k,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+
+    std::cout << __func__ << std::endl;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+    constexpr auto I7 = Number<7>{};
+    constexpr auto I8 = Number<8>{};
+
+    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
+    DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
+    DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
+
+    in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
+    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
+    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
+
+    const auto in_n_hi_wi_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
+    const auto wei_k_y_x_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
+    const auto out_n_ho_wo_k_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
+
+#if 1
+    // [M, N, K0, K1] = [128, 128, 4, 4] for fp32
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 4;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 2;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 1
+    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 2;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 1
+    // [M, N, K0, K1] = [128, 256, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 256;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 4;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 4;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#endif
+
+    const auto descs =
+        transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk(out_n_ho_wo_k_desc,
+                                                                            wei_k_y_x_c_desc,
+                                                                            in_n_hi_wi_c_desc,
+                                                                            conv_strides,
+                                                                            conv_dilations,
+                                                                            in_left_pads,
+                                                                            in_right_pads,
+                                                                            I0,
+                                                                            I0,
+                                                                            Number<GemmK1>{});
+
+    const auto out_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
+    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
+    const auto in_gemmm_gemmn_grid_desc          = descs[I2];
+
+    // HACK: hacks that control index calculation when iterating over A, B, C matrix
+    constexpr auto out_gemmk0_gemmm_gemmk1_grid_iterator_hacks = make_tuple(
+        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 0+: gemmk0
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},   // 1+: gemmm
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 2+: gemmk1
+        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 0-: gemmk0
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{},   // 1-: gemmm
+                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: gemmk1
+
+    constexpr auto wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 0+: gemmk0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1+: gemmn
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 2+: gemmk1
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 0-: Gemmk0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: Gemmn
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 2-: Gemmk1
+
+    constexpr auto in_m0_m1_m2_n_grid_iterator_hacks = make_tuple(
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},  // 0+: MRepeat
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 1+: NRepeat
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},  // 2+: MWaves
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 3+: NWaves
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},  // 4+: M0
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},  // 5+: M1
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},  // 6+: M2
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 7+: N1
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},   // 0-: MRepeat
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 1-: NRepeat
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},   // 2-: MWaves
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},   // 3-: NWaves
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},   // 4-: M0
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},   // 5-: M1
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},   // 6-: M2
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})); // 7-: N1
+
+    constexpr auto out_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0>{};
+
+    constexpr auto wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{};
+
+    for(index_t i = 0; i < 5; ++i)
+    {
+        float ave_time = driver_dynamic_gemm_xdlops_v2r3<
+            BlockSize,
+            TInWei,
+            TAcc,
+            TOut,
+            InMemoryDataOperation::Set,
+            decltype(out_gemmk0_gemmm_gemmk1_grid_desc),
+            decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
+            decltype(in_gemmm_gemmn_grid_desc),
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerWave,
+            GemmNPerWave,
+            MRepeat,
+            NRepeat,
+            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
+            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmABlockTransferSrcScalarPerVector_GemmK1,
+            GemmABlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
+            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
+            Sequence<2, 0, 1>,
+            Sequence<0, 2, 1>,
+            1,
+            GemmBBlockTransferSrcScalarPerVector_GemmN,
+            GemmBBlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+#if 0
+            Sequence<0, 2, 4, 5, 6, 1, 3, 7>,
+#else
+            Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+#endif
+            7,
+            GemmCThreadTransferDstScalarPerVector,
+            decltype(out_gemmk0_gemmm_gemmk1_grid_iterator_hacks),
+            decltype(wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks),
+            decltype(in_m0_m1_m2_n_grid_iterator_hacks),
+            decltype(out_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks),
+            decltype(wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks),
+            true // CAccessOrderMRepeatNRepeat
+            >(static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
+              static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
+              static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
+              out_gemmk0_gemmm_gemmk1_grid_desc,
+              wei_gemmk0_gemmn_gemmk1_grid_desc,
+              in_gemmm_gemmn_grid_desc,
+              out_gemmk0_gemmm_gemmk1_grid_iterator_hacks,
+              wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks,
+              in_m0_m1_m2_n_grid_iterator_hacks,
+              out_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks,
+              wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks,
+              nrepeat);
+
+        {
+            const auto N = out_n_ho_wo_k_lengths[I0];
+            const auto K = out_n_ho_wo_k_lengths[I3];
+            const auto C = wei_k_y_x_c_lengths[I3];
+
+            const auto Hi = in_n_hi_wi_c_lengths[I1];
+            const auto Wi = in_n_hi_wi_c_lengths[I2];
+
+            const auto Ho = out_n_ho_wo_k_lengths[I1];
+            const auto Wo = out_n_ho_wo_k_lengths[I2];
+
+            const auto Y = wei_k_y_x_c_lengths[I1];
+            const auto X = wei_k_y_x_c_lengths[I2];
+
+            float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
+                         (std::size_t(1000) * 1000 * 1000) / ave_time;
+
+            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
+                      << std::endl;
+        }
+    }
+
+    // copy result back to host
+    in_n_hi_wi_c_device_buf.FromDevice(in_n_hi_wi_c.mData.data());
+}
--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,283 @@
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
+
+template <typename TInWei,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
+    const InLengths& in_n_c_hi_wi_lengths,
+    const WeiLengths& wei_k_c_y_x_lengths,
+    const OutLengths& out_n_k_ho_wo_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    const Tensor<TInWei>& in_n_c_hi_wi,
+    const Tensor<TInWei>& wei_k_c_y_x,
+    Tensor<TOut>& out_n_k_ho_wo,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+
+    std::cout << __func__ << std::endl;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+    constexpr auto I7 = Number<7>{};
+    constexpr auto I8 = Number<8>{};
+
+    DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
+
+    in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+    wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
+
+    const auto in_n_c_hi_wi_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
+    const auto wei_k_c_y_x_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
+    const auto out_n_k_ho_wo_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
+
+#if 0
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmKPack    = 8;
+
+    constexpr index_t MRepeat = 1;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 4;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
+#elif 0
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmKPack    = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 4;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
+#elif 0
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmKPack    = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
+#elif 1
+    // [M, N, K0, K1] = [256, 128, 4, 4]
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmKPack    = 4;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
+#elif 1
+    // [M, N, K0, K1] = [128, 128, 4, 4]
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmKPack    = 4;
+
+    constexpr index_t MRepeat = 1;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_KPack = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_KPack = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
+#endif
+
+    const auto descs =
+#if 1
+        transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad
+#else
+        transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_1x1
+#endif
+        <TInWei, GemmMPerBlock, GemmNPerBlock, GemmMPerWave, GemmNPerWave, GemmKPack>(
+            wei_k_c_y_x_desc,
+            in_n_c_hi_wi_desc,
+            out_n_k_ho_wo_desc,
+            conv_strides,
+            conv_dilations,
+            in_left_pads,
+            in_right_pads);
+
+    for(index_t i = 0; i < 5; ++i)
+    {
+#if 0
+        float ave_time = launch_kernel_dynamic_gemm_xdlops_v1
+#else
+        float ave_time = launch_kernel_dynamic_gemm_xdlops_v2
+#endif
+        <BlockSize,
+         TInWei,
+         TAcc,
+         TOut,
+         InMemoryDataOperation::Set,
+         decltype(descs[I0]),
+         decltype(descs[I1]),
+         decltype(descs[I2]),
+         decltype(descs[I3]),
+         GemmMPerBlock,
+         GemmNPerBlock,
+         GemmKPerBlock,
+         GemmMPerWave,
+         GemmNPerWave,
+         GemmKPack,
+         MRepeat,
+         NRepeat,
+         GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
+         GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
+         Sequence<1, 0, 2>,
+         Sequence<1, 0, 2>,
+         2,
+         GemmABlockTransferSrcScalarPerVector_GemmK,
+         GemmABlockTransferDstScalarPerVector_KPack,
+         false, // don't move back src coordinate after threadwise copy
+         GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
+         GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
+         Sequence<0, 2, 1>,
+         Sequence<1, 0, 2>,
+         1,
+         GemmBBlockTransferSrcScalarPerVector_GemmN,
+         GemmBBlockTransferDstScalarPerVector_KPack,
+         false, // don't move back src coordinate after threadwise copy, which will be fused
+                // with MoveSrcSliceWindow() to save addr computation
+         Sequence<2, 3, 0, 1>,
+         3,
+         GemmCThreadTransferDstScalarPerVector_GemmN1,
+         decltype(descs[I4]),
+         decltype(descs[I5]),
+         decltype(descs[I6]),
+         decltype(descs[I7]),
+         decltype(descs[I8])>(static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
+                              static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
+                              static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
+                              descs[I0],
+                              descs[I1],
+                              descs[I2],
+                              descs[I3],
+                              descs[I4],
+                              descs[I5],
+                              descs[I6],
+                              descs[I7],
+                              descs[I8],
+                              nrepeat);
+
+        float perf = (float)calculate_convolution_flops(
+                         in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) /
+                     (std::size_t(1000) * 1000 * 1000) / ave_time;
+
+        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
+    }
+
+    // copy result back to host
+    out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
+}
--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,309 @@
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
+#include "driver_dynamic_gemm_xdlops_v2r2.hpp"
+
+template <typename TInWei,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw(
+    const InLengths& in_n_c_hi_wi_lengths,
+    const WeiLengths& wei_k_c_y_x_lengths,
+    const OutLengths& out_n_k_ho_wo_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    const Tensor<TInWei>& in_n_c_hi_wi,
+    const Tensor<TInWei>& wei_k_c_y_x,
+    Tensor<TOut>& out_n_k_ho_wo,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+
+    std::cout << __func__ << std::endl;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+    constexpr auto I7 = Number<7>{};
+    constexpr auto I8 = Number<8>{};
+
+    DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
+
+    in_n_c_hi_wi_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+    wei_k_c_y_x_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    out_n_k_ho_wo_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
+
+    const auto in_n_c_hi_wi_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
+    const auto wei_k_c_y_x_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
+    const auto out_n_k_ho_wo_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
+
+#if 0
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmK1    = 8;
+
+    constexpr index_t MRepeat = 1;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 4;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 0
+    // [M, N, K0, K1] = [256, 128, 4, 8]
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 32, 2>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 4;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 0
+    // [M, N, K0, K1] = [256, 128, 4, 8]
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 1
+    // [M, N, K0, K1] = [256, 128, 4, 4]
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmK1       = 4;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 0
+    // [M, N, K0, K1] = [128, 128, 4, 4]
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmK1       = 4;
+
+    constexpr index_t MRepeat = 1;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN  = 1;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#endif
+
+    const auto descs =
+        transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
+                                                                          in_n_c_hi_wi_desc,
+                                                                          out_n_k_ho_wo_desc,
+                                                                          conv_strides,
+                                                                          conv_dilations,
+                                                                          in_left_pads,
+                                                                          in_right_pads,
+                                                                          Number<GemmK1>{});
+
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
+    const auto in_gemmk0_gemmn_gemmk1_grid_desc  = descs[I1];
+    const auto out_gemmm_gemmn_grid_desc         = descs[I2];
+
+    // HACK: hacks that control index calculation when iterating over A, B, C matrix
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks = make_tuple(
+        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
+
+    constexpr auto out_m0_m1_m2_n_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{}));
+
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0>{};
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
+
+    for(index_t i = 0; i < 5; ++i)
+    {
+        float ave_time = driver_dynamic_gemm_xdlops_v2r2<
+            BlockSize,
+            TInWei,
+            TAcc,
+            TOut,
+            InMemoryDataOperation::Set,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
+            decltype(out_gemmm_gemmn_grid_desc),
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerWave,
+            GemmNPerWave,
+            GemmK1,
+            MRepeat,
+            NRepeat,
+            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
+            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmABlockTransferSrcScalarPerVector_GemmK1,
+            GemmABlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
+            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
+            Sequence<0, 2, 1>,
+            Sequence<1, 0, 2>,
+            1,
+            GemmBBlockTransferSrcScalarPerVector_GemmN,
+            GemmBBlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            Sequence<3, 0, 1, 2>,
+            3,
+            GemmCThreadTransferDstScalarPerVector,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_iterator_hacks),
+            decltype(out_m0_m1_m2_n_grid_iterator_hacks),
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks)>(
+            static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
+            static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
+            static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
+            wei_gemmk0_gemmm_gemmk1_grid_desc,
+            in_gemmk0_gemmn_gemmk1_grid_desc,
+            out_gemmm_gemmn_grid_desc,
+            wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks,
+            in_gemmk0_gemmn_gemmk1_grid_iterator_hacks,
+            out_m0_m1_m2_n_grid_iterator_hacks,
+            wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks,
+            in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks,
+            nrepeat);
+
+        float perf = (float)calculate_convolution_flops(
+                         in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) /
+                     (std::size_t(1000) * 1000 * 1000) / ave_time;
+
+        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
+    }
+
+    // copy result back to host
+    out_n_k_ho_wo_device_buf.FromDevice(out_n_k_ho_wo.mData.data());
+}
--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,212 @@
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
+#include "driver_dynamic_gemm_xdlops_v2r2.hpp"
+
+template <typename TInWei,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk(
+    const InLengths& in_n_hi_wi_c_lengths,
+    const WeiLengths& wei_k_y_x_c_lengths,
+    const OutLengths& out_n_ho_wo_k_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    const Tensor<TInWei>& in_n_hi_wi_c,
+    const Tensor<TInWei>& wei_k_y_x_c,
+    Tensor<TOut>& out_n_ho_wo_k,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+
+    std::cout << __func__ << std::endl;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+    constexpr auto I7 = Number<7>{};
+    constexpr auto I8 = Number<8>{};
+
+    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
+    DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
+    DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
+
+    in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
+    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
+    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
+
+    const auto in_n_hi_wi_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
+    const auto wei_k_y_x_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
+    const auto out_n_ho_wo_k_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
+
+#if 1
+    // [M, N, K0, K1] = [256, 128, 4, 8]
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
+#endif
+
+    const auto descs =
+        transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(wei_k_y_x_c_desc,
+                                                                          in_n_hi_wi_c_desc,
+                                                                          out_n_ho_wo_k_desc,
+                                                                          conv_strides,
+                                                                          conv_dilations,
+                                                                          in_left_pads,
+                                                                          in_right_pads,
+                                                                          Number<GemmK1>{});
+
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
+    const auto in_gemmk0_gemmn_gemmk1_grid_desc  = descs[I1];
+    const auto out_gemmm_gemmn_grid_desc         = descs[I2];
+
+    // HACK: hacks that control index calculation when iterating over A, B, C matrix
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks = make_tuple(
+        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
+
+    constexpr auto out_m0_m1_m2_n_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{}));
+
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0>{};
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
+
+    for(index_t i = 0; i < 5; ++i)
+    {
+        float ave_time = driver_dynamic_gemm_xdlops_v2r2<
+            BlockSize,
+            TInWei,
+            TAcc,
+            TOut,
+            InMemoryDataOperation::Set,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
+            decltype(out_gemmm_gemmn_grid_desc),
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerWave,
+            GemmNPerWave,
+            MRepeat,
+            NRepeat,
+            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
+            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmABlockTransferSrcScalarPerVector_GemmK1,
+            GemmABlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
+            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmBBlockTransferSrcScalarPerVector_GemmK1,
+            GemmBBlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            Sequence<2, 3, 0, 1>,
+            2,
+            GemmCThreadTransferDstScalarPerVector,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_iterator_hacks),
+            decltype(out_m0_m1_m2_n_grid_iterator_hacks),
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks)>(
+            static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
+            static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
+            static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
+            wei_gemmk0_gemmm_gemmk1_grid_desc,
+            in_gemmk0_gemmn_gemmk1_grid_desc,
+            out_gemmm_gemmn_grid_desc,
+            wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks,
+            in_gemmk0_gemmn_gemmk1_grid_iterator_hacks,
+            out_m0_m1_m2_n_grid_iterator_hacks,
+            wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks,
+            in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks,
+            nrepeat);
+
+        {
+            const auto N = out_n_ho_wo_k_lengths[I0];
+            const auto K = out_n_ho_wo_k_lengths[I3];
+            const auto C = wei_k_y_x_c_lengths[I3];
+
+            const auto Hi = in_n_hi_wi_c_lengths[I1];
+            const auto Wi = in_n_hi_wi_c_lengths[I2];
+
+            const auto Ho = out_n_ho_wo_k_lengths[I1];
+            const auto Wo = out_n_ho_wo_k_lengths[I2];
+
+            const auto Y = wei_k_y_x_c_lengths[I1];
+            const auto X = wei_k_y_x_c_lengths[I2];
+
+            float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
+                         (std::size_t(1000) * 1000 * 1000) / ave_time;
+
+            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
+                      << std::endl;
+        }
+    }
+
+    // copy result back to host
+    out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
+}
--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,277 @@
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
+#include "driver_dynamic_gemm_xdlops_v2r3.hpp"
+
+template <typename TInWei,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk(
+    const InLengths& in_n_hi_wi_c_lengths,
+    const WeiLengths& wei_k_y_x_c_lengths,
+    const OutLengths& out_n_ho_wo_k_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    const Tensor<TInWei>& in_n_hi_wi_c,
+    const Tensor<TInWei>& wei_k_y_x_c,
+    Tensor<TOut>& out_n_ho_wo_k,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+
+    std::cout << __func__ << std::endl;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+    constexpr auto I7 = Number<7>{};
+    constexpr auto I8 = Number<8>{};
+
+    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
+    DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
+    DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
+
+    in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
+    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
+    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
+
+    const auto in_n_hi_wi_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
+    const auto wei_k_y_x_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
+    const auto out_n_ho_wo_k_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
+
+#if 0
+    // [M, N, K0, K1] = [256, 256, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 256;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 4;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
+#elif 1
+    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
+#elif 0
+    // [M, N, K0, K1] = [128, 128, 4, 4] for fp32
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 4;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 4;
+#endif
+
+    const auto descs =
+        transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(wei_k_y_x_c_desc,
+                                                                          in_n_hi_wi_c_desc,
+                                                                          out_n_ho_wo_k_desc,
+                                                                          conv_strides,
+                                                                          conv_dilations,
+                                                                          in_left_pads,
+                                                                          in_right_pads,
+                                                                          Number<GemmK1>{});
+
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = descs[I0];
+    const auto in_gemmk0_gemmn_gemmk1_grid_desc  = descs[I1];
+    const auto out_gemmm_gemmn_grid_desc         = descs[I2];
+
+    // HACK: hacks that control index calculation when iterating over A, B, C matrix
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks = make_tuple(
+        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
+
+    constexpr auto out_m0_m1_m2_n_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{}));
+
+    constexpr auto wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0>{};
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
+
+    for(index_t i = 0; i < 5; ++i)
+    {
+        float ave_time = driver_dynamic_gemm_xdlops_v2r3<
+            BlockSize,
+            TInWei,
+            TAcc,
+            TOut,
+            InMemoryDataOperation::Set,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_desc),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_desc),
+            decltype(out_gemmm_gemmn_grid_desc),
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerWave,
+            GemmNPerWave,
+            MRepeat,
+            NRepeat,
+            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
+            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmABlockTransferSrcScalarPerVector_GemmK1,
+            GemmABlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
+            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmBBlockTransferSrcScalarPerVector_GemmK1,
+            GemmBBlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
+            6,
+            GemmCThreadTransferDstScalarPerVector,
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_iterator_hacks),
+            decltype(out_m0_m1_m2_n_grid_iterator_hacks),
+            decltype(wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks),
+            decltype(in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks),
+            false // CAccessOrderMRepeatNRepeat
+            >(static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
+              static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
+              static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
+              wei_gemmk0_gemmm_gemmk1_grid_desc,
+              in_gemmk0_gemmn_gemmk1_grid_desc,
+              out_gemmm_gemmn_grid_desc,
+              wei_gemmk0_gemmm_gemmk1_grid_iterator_hacks,
+              in_gemmk0_gemmn_gemmk1_grid_iterator_hacks,
+              out_m0_m1_m2_n_grid_iterator_hacks,
+              wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks,
+              in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks,
+              nrepeat);
+
+        {
+            const auto N = out_n_ho_wo_k_lengths[I0];
+            const auto K = out_n_ho_wo_k_lengths[I3];
+            const auto C = wei_k_y_x_c_lengths[I3];
+
+            const auto Hi = in_n_hi_wi_c_lengths[I1];
+            const auto Wi = in_n_hi_wi_c_lengths[I2];
+
+            const auto Ho = out_n_ho_wo_k_lengths[I1];
+            const auto Wo = out_n_ho_wo_k_lengths[I2];
+
+            const auto Y = wei_k_y_x_c_lengths[I1];
+            const auto X = wei_k_y_x_c_lengths[I2];
+
+            float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
+                         (std::size_t(1000) * 1000 * 1000) / ave_time;
+
+            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
+                      << std::endl;
+        }
+    }
+
+    // copy result back to host
+    out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
+}
--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,364 @@
+#include <unistd.h>
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
+#include "driver_dynamic_gemm_xdlops_v2r3.hpp"
+
+template <typename TInWei,
+          typename TAcc,
+          typename TOut,
+          typename InLengths,
+          typename WeiLengths,
+          typename OutLengths,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
+    const InLengths& in_n_hi_wi_c_lengths,
+    const WeiLengths& wei_k_y_x_c_lengths,
+    const OutLengths& out_n_ho_wo_k_lengths,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    const Tensor<TInWei>& in_n_hi_wi_c,
+    const Tensor<TInWei>& wei_k_y_x_c,
+    Tensor<TOut>& out_n_ho_wo_k,
+    ck::index_t nrepeat)
+{
+    using namespace ck;
+
+    std::cout << __func__ << std::endl;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+    constexpr auto I4 = Number<4>{};
+    constexpr auto I5 = Number<5>{};
+    constexpr auto I6 = Number<6>{};
+    constexpr auto I7 = Number<7>{};
+    constexpr auto I8 = Number<8>{};
+
+    DeviceMem in_n_hi_wi_c_device_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
+    DeviceMem wei_k_y_x_c_device_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
+    DeviceMem out_n_ho_wo_k_device_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
+
+    in_n_hi_wi_c_device_buf.ToDevice(in_n_hi_wi_c.mData.data());
+    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
+    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());
+
+    const auto in_n_hi_wi_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
+    const auto wei_k_y_x_c_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
+    const auto out_n_ho_wo_k_desc =
+        make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
+
+#if 0
+    // [M, N, K0, K1] = [256, 256, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 256;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 4;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 0
+    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 4;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 1
+    // [M, N, K0, K1] = [128, 256, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 256;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 4;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 4, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 1
+    // [M, N, K0, K1] = [128, 128, 4, 4] for fp32
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 4;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 4>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 4>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 4;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 1
+    // [M, N, K0, K1] = [128, 128, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 128;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 32;
+    constexpr index_t GemmNPerWave = 32;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 2;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 2, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#elif 1
+    // [M, N, K0, K1] = [256, 128, 4, 8] for fp16
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock = 256;
+    constexpr index_t GemmNPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 4;
+
+    constexpr index_t GemmMPerWave = 64;
+    constexpr index_t GemmNPerWave = 64;
+    constexpr index_t GemmK1       = 8;
+
+    constexpr index_t MRepeat = 2;
+    constexpr index_t NRepeat = 1;
+
+    using GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1   = Sequence<1, 4, 8>;
+    using GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    using GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1   = Sequence<1, 2, 8>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1 = Sequence<4, 64, 1>;
+
+    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmK1 = 8;
+    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
+
+    constexpr index_t GemmCThreadTransferDstScalarPerVector = 1;
+#endif
+
+    const auto descs =
+        transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(in_n_hi_wi_c_desc,
+                                                                          wei_k_y_x_c_desc,
+                                                                          out_n_ho_wo_k_desc,
+                                                                          conv_strides,
+                                                                          conv_dilations,
+                                                                          in_left_pads,
+                                                                          in_right_pads,
+                                                                          Number<GemmK1>{});
+
+    const auto in_gemmk0_gemmm_gemmk1_grid_desc  = descs[I0];
+    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = descs[I1];
+    const auto out_gemmm_gemmn_grid_desc         = descs[I2];
+
+    // HACK: hacks that control index calculation when iterating over A, B, C matrix
+    constexpr auto in_gemmk0_gemmm_gemmk1_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},   // 0+: GemmK0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},   // 1+: GemmM
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),  // 2+: GemmK1
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},   // 0-: GemmK0
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},   // 1-: GemmM
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})); // 2-: GemmK1
+
+    constexpr auto wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0+: GemmK0
+                              Sequence<0, 0, 0, 0, 0>{},   // 1+: GemmN
+                              Sequence<0, 0, 0, 0, 0>{}),  // 2+: GemmK1
+                   make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0-: GemmK0
+                              Sequence<0, 0, 0, 0, 0>{},   // 1-: GemmN
+                              Sequence<0, 0, 0, 0, 0>{})); // 2-: GemmK1
+
+    constexpr auto out_m0_m1_m2_n_grid_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0+: MRepeat
+                              Sequence<0, 0, 0, 0, 0>{},   // 1+: NRepeat
+                              Sequence<0, 0, 0, 0, 0>{},   // 2+: MWaves
+                              Sequence<0, 0, 0, 0, 0>{},   // 3+: NWaves
+                              Sequence<0, 0, 0, 0, 0>{},   // 4+: M0
+                              Sequence<0, 0, 0, 0, 0>{},   // 5+: M1
+                              Sequence<0, 0, 0, 0, 0>{},   // 6+: M2
+                              Sequence<0, 0, 0, 0, 0>{}),  // 7+: N1
+                   make_tuple(Sequence<0, 0, 0, 0, 0>{},   // 0-: MRepeat
+                              Sequence<0, 0, 0, 0, 0>{},   // 1-: NRepeat
+                              Sequence<0, 0, 0, 0, 0>{},   // 2-: MWaves
+                              Sequence<0, 0, 0, 0, 0>{},   // 3-: NWaves
+                              Sequence<0, 0, 0, 0, 0>{},   // 4-: M0
+                              Sequence<0, 0, 0, 0, 0>{},   // 5-: M1
+                              Sequence<0, 0, 0, 0, 0>{},   // 6-: M2
+                              Sequence<0, 0, 0, 0, 0>{})); // 7-: N1
+
+    constexpr auto wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0>{};
+
+    constexpr auto in_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
+
+    for(index_t i = 0; i < 5; ++i)
+    {
+        float ave_time = driver_dynamic_gemm_xdlops_v2r3<
+            BlockSize,
+            TInWei,
+            TAcc,
+            TOut,
+            InMemoryDataOperation::Set,
+            decltype(in_gemmk0_gemmm_gemmk1_grid_desc),
+            decltype(wei_gemmk0_gemmn_gemmk1_grid_desc),
+            decltype(out_gemmm_gemmn_grid_desc),
+            GemmMPerBlock,
+            GemmNPerBlock,
+            GemmKPerBlock,
+            GemmMPerWave,
+            GemmNPerWave,
+            MRepeat,
+            NRepeat,
+            GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
+            GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmABlockTransferSrcScalarPerVector_GemmK1,
+            GemmABlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
+            GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
+            Sequence<1, 0, 2>,
+            Sequence<1, 0, 2>,
+            2,
+            GemmBBlockTransferSrcScalarPerVector_GemmK1,
+            GemmBBlockTransferDstScalarPerVector_GemmK1,
+            false, // don't move back src coordinate after threadwise copy
+            Sequence<2, 3, 0, 1, 7, 5, 4, 6>,
+            7,
+            GemmCThreadTransferDstScalarPerVector,
+            decltype(in_gemmk0_gemmm_gemmk1_grid_iterator_hacks),
+            decltype(wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks),
+            decltype(out_m0_m1_m2_n_grid_iterator_hacks),
+            decltype(in_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks),
+            decltype(wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks),
+            false // CAccessOrderMRepeatNRepeat
+            >(static_cast<TInWei*>(in_n_hi_wi_c_device_buf.GetDeviceBuffer()),
+              static_cast<TInWei*>(wei_k_y_x_c_device_buf.GetDeviceBuffer()),
+              static_cast<TOut*>(out_n_ho_wo_k_device_buf.GetDeviceBuffer()),
+              in_gemmk0_gemmm_gemmk1_grid_desc,
+              wei_gemmk0_gemmn_gemmk1_grid_desc,
+              out_gemmm_gemmn_grid_desc,
+              in_gemmk0_gemmm_gemmk1_grid_iterator_hacks,
+              wei_gemmk0_gemmn_gemmk1_grid_iterator_hacks,
+              out_m0_m1_m2_n_grid_iterator_hacks,
+              in_gemmk0_gemmm_gemmk1_grid_move_slice_window_iterator_hacks,
+              wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks,
+              nrepeat);
+
+        {
+            const auto N = out_n_ho_wo_k_lengths[I0];
+            const auto K = out_n_ho_wo_k_lengths[I3];
+            const auto C = wei_k_y_x_c_lengths[I3];
+
+            const auto Hi = in_n_hi_wi_c_lengths[I1];
+            const auto Wi = in_n_hi_wi_c_lengths[I2];
+
+            const auto Ho = out_n_ho_wo_k_lengths[I1];
+            const auto Wo = out_n_ho_wo_k_lengths[I2];
+
+            const auto Y = wei_k_y_x_c_lengths[I1];
+            const auto X = wei_k_y_x_c_lengths[I2];
+
+            float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
+                         (std::size_t(1000) * 1000 * 1000) / ave_time;
+
+            std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
+                      << std::endl;
+        }
+    }
+
+    // copy result back to host
+    out_n_ho_wo_k_device_buf.FromDevice(out_n_ho_wo_k.mData.data());
+}
--- a/driver/include/host_conv.hpp
+++ b/driver/include/host_conv.hpp
@@ -1,13 +1,13 @@
 #pragma once
 #include "host_tensor.hpp"

-template <class TIn,
-          class TWei,
-          class TOut,
-          class ConvStrides,
-          class ConvDilations,
-          class InLeftPads,
-          class InRightPads>
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
 void host_direct_convolution(const Tensor<TIn>& in,
                             const Tensor<TWei>& wei,
                             Tensor<TOut>& out,
@@ -88,7 +88,7 @@ void host_direct_convolution(const Tensor<TIn>& in,
    }
 }

-template <class TIn, class TWei, class TOut, class InLeftPads, class InRightPads>
+template <typename TIn, typename TWei, typename TOut, typename InLeftPads, typename InRightPads>
 void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                                   const Tensor<TWei>& wei_kcyx,
                                   Tensor<TOut>& out_nkhw,
--- a/driver/include/host_conv_bwd_data.hpp
+++ b/driver/include/host_conv_bwd_data.hpp
@@ -6,56 +6,62 @@ template <typename TIn,
          typename TOut,
          typename ConvStrides,
          typename ConvDilations,
-          typename LeftPads,
-          typename RightPads>
-void host_direct_convolution_backward_data(Tensor<TIn>& in_nchw,
-                                           const Tensor<TWei>& wei_kcyx,
-                                           const Tensor<TOut>& out_nkhw,
-                                           ConvStrides,
-                                           ConvDilations,
-                                           LeftPads,
-                                           RightPads)
+          typename InLeftPads,
+          typename InRightPads>
+void host_direct_convolution_backward_data(Tensor<TIn>& in,
+                                           const Tensor<TWei>& wei,
+                                           const Tensor<TOut>& out,
+                                           const ConvStrides& conv_strides,
+                                           const ConvDilations& conv_dilations,
+                                           const InLeftPads& in_left_pads,
+                                           const InRightPads& in_right_pads,
+                                           const ConvTensorLayout layout = ConvTensorLayout::NCHW)
 {
    using namespace ck;

-    int N  = in_nchw.mDesc.GetLengths()[0];
-    int C  = in_nchw.mDesc.GetLengths()[1];
-    int HI = in_nchw.mDesc.GetLengths()[2];
-    int WI = in_nchw.mDesc.GetLengths()[3];
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};

-    std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
-    std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
-    std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
+    auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
+        std::size_t N  = in.mDesc.GetLengths()[I0];
+        std::size_t C  = in.mDesc.GetLengths()[I1];
+        std::size_t Hi = in.mDesc.GetLengths()[I2];
+        std::size_t Wi = in.mDesc.GetLengths()[I3];

-    std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
-    std::size_t WO = out_nkhw.mDesc.GetLengths()[3];
+        std::size_t K = wei.mDesc.GetLengths()[I0];
+        std::size_t Y = wei.mDesc.GetLengths()[I2];
+        std::size_t X = wei.mDesc.GetLengths()[I3];
+
+        std::size_t Ho = out.mDesc.GetLengths()[I2];
+        std::size_t Wo = out.mDesc.GetLengths()[I3];

-    auto f = [&](auto n, auto c, auto hi, auto wi) {
        double v = 0;

        for(int y = 0; y < Y; ++y)
        {
-            int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];
+            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];

-            if(h_tmp % ConvStrides{}[0] == 0)
+            if(h_tmp % conv_strides[I0] == 0)
            {
-                int ho = h_tmp / ConvStrides{}[0];
+                int ho = h_tmp / conv_strides[I0];

-                if(ho >= 0 && ho < HO)
+                if(ho >= 0 && ho < Ho)
                {
                    for(int x = 0; x < X; ++x)
                    {
-                        int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
+                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];

-                        if(w_tmp % ConvStrides{}[1] == 0)
+                        if(w_tmp % conv_strides[I1] == 0)
                        {
-                            int wo = w_tmp / ConvStrides{}[1];
+                            int wo = w_tmp / conv_strides[I1];

-                            if(wo >= 0 && wo < WO)
+                            if(wo >= 0 && wo < Wo)
                            {
                                for(int k = 0; k < K; ++k)
                                {
-                                    v += out_nkhw(n, k, ho, wo) * wei_kcyx(k, c, y, x);
+                                    v += out(n, k, ho, wo) * wei(k, c, y, x);
                                }
                            }
                        }
@@ -64,14 +70,74 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in_nchw,
            }
        }

-        in_nchw(n, c, hi, wi) = v;
+        in(n, c, hi, wi) = v;
    };

-    auto f_par = make_ParallelTensorFunctor(f,
-                                            in_nchw.mDesc.GetLengths()[0],
-                                            in_nchw.mDesc.GetLengths()[1],
-                                            in_nchw.mDesc.GetLengths()[2],
-                                            in_nchw.mDesc.GetLengths()[3]);
+    auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
+        std::size_t N  = in.mDesc.GetLengths()[I0];
+        std::size_t Hi = in.mDesc.GetLengths()[I1];
+        std::size_t Wi = in.mDesc.GetLengths()[I2];
+        std::size_t C  = in.mDesc.GetLengths()[I3];

-    f_par(std::thread::hardware_concurrency());
+        std::size_t K = wei.mDesc.GetLengths()[I0];
+        std::size_t Y = wei.mDesc.GetLengths()[I1];
+        std::size_t X = wei.mDesc.GetLengths()[I2];
+
+        std::size_t Ho = out.mDesc.GetLengths()[I1];
+        std::size_t Wo = out.mDesc.GetLengths()[I2];
+
+        double v = 0;
+
+        for(int y = 0; y < Y; ++y)
+        {
+            int h_tmp = hi + in_left_pads[I0] - y * conv_dilations[I0];
+
+            if(h_tmp % conv_strides[I0] == 0)
+            {
+                int ho = h_tmp / conv_strides[I0];
+
+                if(ho >= 0 && ho < Ho)
+                {
+                    for(int x = 0; x < X; ++x)
+                    {
+                        int w_tmp = wi + in_left_pads[I1] - x * conv_dilations[I1];
+
+                        if(w_tmp % conv_strides[I1] == 0)
+                        {
+                            int wo = w_tmp / conv_strides[I1];
+
+                            if(wo >= 0 && wo < Wo)
+                            {
+                                for(int k = 0; k < K; ++k)
+                                {
+                                    v += out(n, ho, wo, k) * wei(k, y, x, c);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        in(n, hi, wi, c) = v;
+    };
+
+    switch(layout)
+    {
+    case ConvTensorLayout::NCHW:
+        make_ParallelTensorFunctor(f_nchw,
+                                   in.mDesc.GetLengths()[0],
+                                   in.mDesc.GetLengths()[1],
+                                   in.mDesc.GetLengths()[2],
+                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    case ConvTensorLayout::NHWC:
+        make_ParallelTensorFunctor(f_nhwc,
+                                   in.mDesc.GetLengths()[0],
+                                   in.mDesc.GetLengths()[1],
+                                   in.mDesc.GetLengths()[2],
+                                   in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
+        break;
+    default: throw std::runtime_error("wrong! not supported layout");
+    }
 }
--- a/driver/include/host_tensor.hpp
+++ b/driver/include/host_tensor.hpp
@@ -9,7 +9,7 @@
 #include <cassert>
 #include <iostream>

-template <class Range>
+template <typename Range>
 std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
 {
    bool first = true;
@@ -24,12 +24,27 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
    return os;
 }

+template <typename T, typename Range>
+std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
+{
+    bool first = true;
+    for(auto&& v : range)
+    {
+        if(first)
+            first = false;
+        else
+            os << delim;
+        os << T{v};
+    }
+    return os;
+}
+
 typedef enum {
    Half  = 0,
    Float = 1,
 } DataType_t;

-template <class T>
+template <typename T>
 struct DataType;

 template <>
@@ -37,13 +52,13 @@ struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
 {
 };

-template <class F, class T, std::size_t... Is>
+template <typename F, typename T, std::size_t... Is>
 auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
 {
    return f(std::get<Is>(args)...);
 }

-template <class F, class T>
+template <typename F, typename T>
 auto call_f_unpack_args(F f, T args)
 {
    constexpr std::size_t N = std::tuple_size<T>{};
@@ -51,13 +66,13 @@ auto call_f_unpack_args(F f, T args)
    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
 }

-template <class F, class T, std::size_t... Is>
+template <typename F, typename T, std::size_t... Is>
 auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
 {
    return F(std::get<Is>(args)...);
 }

-template <class F, class T>
+template <typename F, typename T>
 auto construct_f_unpack_args(F, T args)
 {
    constexpr std::size_t N = std::tuple_size<T>{};
@@ -77,13 +92,13 @@ struct HostTensorDescriptor

    void CalculateStrides();

-    template <class Range>
+    template <typename Range>
    HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

-    template <class Range1, class Range2>
+    template <typename Range1, typename Range2>
    HostTensorDescriptor(const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
@@ -96,7 +111,7 @@ struct HostTensorDescriptor
    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

-    template <class... Is>
+    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
@@ -111,7 +126,7 @@ struct HostTensorDescriptor

 struct joinable_thread : std::thread
 {
-    template <class... Xs>
+    template <typename... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }
@@ -126,7 +141,7 @@ struct joinable_thread : std::thread
    }
 };

-template <class F, class... Xs>
+template <typename F, typename... Xs>
 struct ParallelTensorFunctor
 {
    F mF;
@@ -180,26 +195,26 @@ struct ParallelTensorFunctor
    }
 };

-template <class F, class... Xs>
+template <typename F, typename... Xs>
 auto make_ParallelTensorFunctor(F f, Xs... xs)
 {
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
 }

-template <class T>
+template <typename T>
 struct Tensor
 {
-    template <class X>
+    template <typename X>
    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

-    template <class X>
+    template <typename X>
    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

-    template <class X, class Y>
+    template <typename X, typename Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
    {
@@ -207,7 +222,7 @@ struct Tensor

    Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

-    template <class G>
+    template <typename G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
@@ -247,13 +262,13 @@ struct Tensor
        }
    }

-    template <class... Is>
+    template <typename... Is>
    T& operator()(Is... is)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

-    template <class... Is>
+    template <typename... Is>
    const T& operator()(Is... is) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
@@ -285,7 +300,7 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s

 void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);

-template <class T>
+template <typename T>
 void check_error(const Tensor<T>& ref, const Tensor<T>& result)
 {
    float error     = 0;
--- a/driver/include/host_tensor_generator.hpp
+++ b/driver/include/host_tensor_generator.hpp
@@ -1,13 +1,14 @@
 #ifndef HOST_TENSOR_GENERATOR_HPP
 #define HOST_TENSOR_GENERATOR_HPP

+#include <cmath>
 #include "config.hpp"

 struct GeneratorTensor_1
 {
    int value = 1;

-    template <class... Is>
+    template <typename... Is>
    double operator()(Is... is)
    {
        return value;
@@ -19,7 +20,7 @@ struct GeneratorTensor_2
    int min_value = 0;
    int max_value = 1;

-    template <class... Is>
+    template <typename... Is>
    double operator()(Is...)
    {
        return (std::rand() % (max_value - min_value)) + min_value;
@@ -28,7 +29,7 @@ struct GeneratorTensor_2

 struct GeneratorTensor_3
 {
-    template <class... Is>
+    template <typename... Is>
    double operator()(Is... is)
    {
        std::array<ck::index_t, sizeof...(Is)> dims = {{static_cast<ck::index_t>(is)...}};
@@ -41,7 +42,7 @@ struct GeneratorTensor_3

 struct GeneratorTensor_Checkboard
 {
-    template <class... Ts>
+    template <typename... Ts>
    double operator()(Ts... Xs) const
    {
        std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};