xdlops_v4r4_fwd fp32/fp16 (#34)

* create files for xdlops * working on blockwise_gemm_xdlops * add KReduction * add m/n repeats * add 2x2 pipeline * added 128x128 wavegemm * use StaticBuffer of vector_type * break vector type to blk_size * add kpack into xdlops_gemm and blockwise_gemm * abroadcast only * add fp32 mfma instructions * adding fp16 mfma * pack half4_t * rename kperwave to kpack * add 32x32x8fp16 * add fp16 mfma * clean code * clean code * V4r4 xdlops kpack (#35) * add kpack with incorrect results * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * add 1x1 kernel * add gridwise_gemm_v2 - single_buffer * enabled dwordx4 for fp16 Co-authored-by: Chao Liu <chao.liu2@amd.com> * refactor fwd-v4r4-xdlops * add v4r4-nhwc-xdlops * improve some perf of nhwc and nchw by tuning parameters, and change scheduling in gridwise-gemm loop * tweak scheduling in gridwise gemm * add v4r3 with a single output copy * init commit: output with slice win * adding sliceWin * add multiple repeats pattern * start adding bwd-v4r1-xdlops * use tuple as SrcBuffer * adding bwd-data v4r1 nhwc xdlops * fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2() * fix bug in host bwd-data conv * initial implementation of bwd-data v4r1 nhwc xdlops * add launch bound flags * enable launch bound * add m/nrepeat=4 * tweak bwd-data v4r1 nhwc xdlops * added bwd-data v4r1 nhwc xdlops with output A and weight B * add fwd-v4r4 nhwc xdlops, A input, B weight, C output Co-authored-by: Chao Liu <chao.liu2@amd.com> [ROCm/composable_kernel commit: 3835318cc3]
2026-05-17 11:30:02 +00:00 · 2021-07-01 14:33:00 -05:00
parent 0d278b8cc8
commit 2331d228e2
54 changed files with 9813 additions and 245 deletions
--- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,365 @@
+#ifndef CK_DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
+#define CK_DRIVER_DYNAMIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V4R4_XDLOPS_NCHW_KCYX_NKHW_HPP
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "driver_dynamic_gemm_xdlops_v1.hpp"
+#include "driver_dynamic_gemm_xdlops_v2.hpp"
+
+namespace ck {
+
+// Builds the implicit-GEMM view of a forward convolution (input N/C/Hi/Wi, weight K/C/Y/X,
+// output N/K/Ho/Wo) for the xdlops GEMM, with explicit left/right input padding.
+//
+// GemmM = K
+// GemmN = N * Ho * Wo
+// GemmK = C * Y * X, split into GemmK0 x GemmKPack (GemmK0 = GemmK / GemmKPack)
+//
+// Returns a tuple holding, in order:
+//   0: weight descriptor, dimensions [GemmK0, GemmM, GemmKPack]
+//   1: input descriptor,  dimensions [GemmK0, GemmN, GemmKPack]
+//   2: output descriptor, dimensions [GemmM / (M1 * M2), M1, M2, GemmN]
+//   3: block cluster descriptor of shape [GemmM / GemmMPerBlock, GemmN / GemmNPerBlock]
+//   4-6: iterator hacks for the weight, input and output descriptors
+//   7-8: move-slice-window iterator hacks for the weight and input descriptors
+//
+// Preconditions (checked with assert only): GemmK is a multiple of GemmKPack, GemmM is a
+// multiple of GemmMPerBlock and GemmN is a multiple of GemmNPerBlock. Divisibility of GemmK0 by
+// the K-per-block tile is NOT checked here, because that tile size is not a parameter of this
+// function; the caller that knows it must validate it.
+template <typename FloatAB,
+          index_t GemmMPerBlock,
+          index_t GemmNPerBlock,
+          index_t GemmMPerWave,
+          index_t GemmNPerWave,
+          index_t GemmKPack,
+          typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_pad(
+    const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
+    const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
+    const DynamicTensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
+    const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
+    const auto K = out_n_k_ho_wo_global_desc.GetLength(I1);
+
+    const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
+    const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
+
+    const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
+    const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
+
+    const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
+    const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
+
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+
+    const auto GemmM  = K;
+    const auto GemmN  = N * Ho * Wo;
+    const auto GemmK  = C * Y * X;
+    const auto GemmK0 = GemmK / GemmKPack;
+
+    // the integer division above silently truncates unless GemmK is a multiple of GemmKPack
+    assert(GemmK % GemmKPack == 0);
+
+    // weight tensor: packed [K, C * Y * X] viewed as [GemmK, GemmM] (the two dimensions swapped)
+    const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+    // split GemmK into [GemmK0, GemmKPack]; GemmKPack becomes the last dimension
+    const auto wei_gemmk0_gemmm_gemmk1_global_desc = transform_dynamic_tensor_descriptor(
+        wei_gemmk_gemmm_global_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmKPack)),
+                   make_pass_through_transform(GemmM)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+    // input tensor: pad the Hi and Wi dimensions
+    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
+        in_n_c_hi_wi_global_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+    // embed each padded spatial dimension into (filter tap, output position) using the
+    // dilation and stride as coefficients: [N, C, Y, Ho, X, Wo]
+    const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
+        in_n_c_hip_wip_global_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
+
+    // merge (C, Y, X) into GemmK and (N, Ho, Wo) into GemmN
+    const auto in_gemmk_gemmn_global_desc =
+        transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc,
+                                            make_tuple(make_merge_transform(make_tuple(C, Y, X)),
+                                                       make_merge_transform(make_tuple(N, Ho, Wo))),
+                                            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    // split GemmK into [GemmK0, GemmKPack]; GemmKPack becomes the last dimension
+    const auto in_gemmk0_gemmn_gemmk1_global_desc = transform_dynamic_tensor_descriptor(
+        in_gemmk_gemmn_global_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmKPack)),
+                   make_pass_through_transform(GemmN)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+    // output tensor: packed [N, K, Ho * Wo] viewed as [GemmM, GemmN]
+    const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)),
+        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    assert(GemmM == out_gemmm_gemmn_global_desc.GetLength(I0));
+    assert(GemmN == out_gemmm_gemmn_global_desc.GetLength(I1));
+    assert(GemmK0 == in_gemmk0_gemmn_gemmk1_global_desc.GetLength(I0));
+    assert(GemmK0 == wei_gemmk0_gemmm_gemmk1_global_desc.GetLength(I0));
+
+    // Only the M and N block tiles are known here. The K-per-block tile size is not a parameter
+    // of this function, so GemmK0's divisibility by it has to be checked by the caller.
+    assert(GemmM % GemmMPerBlock == 0 && GemmN % GemmNPerBlock == 0);
+
+    constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, GemmMPerWave, GemmNPerWave, GemmKPack>{};
+
+    constexpr auto CLayout = xdlops_gemm.GetCLayout();
+
+    // inner lengths used to split GemmM for the output; taken from the xdlops C layout
+    constexpr index_t M1 = CLayout.N1();
+    constexpr index_t M2 = CLayout.M0();
+
+    const auto out_m0_m1_m2_n_global_desc = transform_dynamic_tensor_descriptor(
+        out_gemmm_gemmn_global_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmM / (M1 * M2), M1, M2)),
+                   make_pass_through_transform(GemmN)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}));
+
+    // out_gemm_block_cluster_desc
+    const auto out_gemm_block_cluster_desc = make_cluster_descriptor_v2(
+        make_tuple(GemmM / Number<GemmMPerBlock>{}, GemmN / Number<GemmNPerBlock>{}));
+
+    // hack to control index calculation when iterating over wei_gemmk0_gemmm_gemmk1_global tensor
+    constexpr auto wei_gemmk0_gemmm_gemmk1_global_iterator_hacks = make_tuple(
+        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
+
+    constexpr auto wei_gemmk0_gemmm_gemmk1_global_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0>{};
+
+    // hack to control index calculation when iterating over in_gemmk0_gemmn_gemmk1_global tensor
+    constexpr auto in_gemmk0_gemmn_gemmk1_global_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{}));
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_global_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>{};
+
+    // hack to control index calculation when iterating over out_m0_m1_m2_n_global tensor
+    // hack for NKHW format
+    constexpr auto out_m0_m1_m2_n_global_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{}));
+
+    return make_tuple(wei_gemmk0_gemmm_gemmk1_global_desc,
+                      in_gemmk0_gemmn_gemmk1_global_desc,
+                      out_m0_m1_m2_n_global_desc,
+                      out_gemm_block_cluster_desc,
+                      wei_gemmk0_gemmm_gemmk1_global_iterator_hacks,
+                      in_gemmk0_gemmn_gemmk1_global_iterator_hacks,
+                      out_m0_m1_m2_n_global_iterator_hacks,
+                      wei_gemmk0_gemmm_gemmk1_global_move_slice_window_iterator_hacks,
+                      in_gemmk0_gemmn_gemmk1_global_move_slice_window_iterator_hacks);
+}
+
+// Builds the implicit-GEMM view of a forward convolution (input N/C/Hi/Wi, weight K/C/Y/X,
+// output N/K/Ho/Wo) for the xdlops GEMM, specialized for a 1x1 filter with unit stride, unit
+// dilation and no padding (asserted below). No pad/embed transforms are applied to the input.
+//
+// GemmM = K
+// GemmN = N * Ho * Wo
+// GemmK = C * Y * X, split into GemmK0 x GemmKPack (GemmK0 = GemmK / GemmKPack)
+//
+// Returns a tuple holding, in order:
+//   0: weight descriptor, dimensions [GemmK0, GemmM, GemmKPack]
+//   1: input descriptor,  dimensions [GemmK0, GemmN, GemmKPack]
+//   2: output descriptor, dimensions [GemmM / (M1 * M2), M1, M2, GemmN]
+//   3: block cluster descriptor of shape [GemmM / GemmMPerBlock, GemmN / GemmNPerBlock]
+//   4-6: iterator hacks for the weight, input and output descriptors
+//   7-8: move-slice-window iterator hacks for the weight and input descriptors
+//
+// Preconditions (checked with assert only): the 1x1 conditions above, GemmK is a multiple of
+// GemmKPack, GemmM is a multiple of GemmMPerBlock and GemmN is a multiple of GemmNPerBlock.
+// Divisibility of GemmK0 by the K-per-block tile is NOT checked here, because that tile size is
+// not a parameter of this function; the caller that knows it must validate it.
+template <typename FloatAB,
+          index_t GemmMPerBlock,
+          index_t GemmNPerBlock,
+          index_t GemmMPerWave,
+          index_t GemmNPerWave,
+          index_t GemmKPack,
+          typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_gemm_v4r4_xdlops_nchw_kcyx_nkhw_1x1(
+    const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
+    const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
+    const DynamicTensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
+    const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
+    const auto K = out_n_k_ho_wo_global_desc.GetLength(I1);
+
+    const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
+    const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
+
+    const auto Y = wei_k_c_y_x_global_desc.GetLength(I2);
+    const auto X = wei_k_c_y_x_global_desc.GetLength(I3);
+
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+
+    const auto GemmM  = K;
+    const auto GemmN  = N * Ho * Wo;
+    const auto GemmK  = C * Y * X;
+    const auto GemmK0 = GemmK / GemmKPack;
+
+    // this specialization is only valid for 1x1 filters, unit stride/dilation and zero padding
+    assert(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 &&
+           ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 &&
+           InRightPadW == 0);
+
+    // the integer division above silently truncates unless GemmK is a multiple of GemmKPack
+    assert(GemmK % GemmKPack == 0);
+
+    // weight tensor: packed [K, C] viewed as [GemmK, GemmM] (the two dimensions swapped)
+    const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+    // split GemmK into [GemmK0, GemmKPack]; GemmKPack becomes the last dimension
+    const auto wei_gemmk0_gemmm_gemmk1_global_desc = transform_dynamic_tensor_descriptor(
+        wei_gemmk_gemmm_global_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmKPack)),
+                   make_pass_through_transform(GemmM)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+    // input tensor: C becomes GemmK; input dimensions 0, 2, 3 are merged into GemmN using the
+    // output lengths (N, Ho, Wo).
+    // NOTE(review): presumably Hi == Ho and Wi == Wo under the 1x1 conditions asserted above,
+    // confirm against the caller.
+    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
+        in_n_c_hi_wi_global_desc,
+        make_tuple(make_pass_through_transform(C), make_merge_transform(make_tuple(N, Ho, Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    // split GemmK into [GemmK0, GemmKPack]; GemmKPack becomes the last dimension
+    const auto in_gemmk0_gemmn_gemmk1_global_desc = transform_dynamic_tensor_descriptor(
+        in_gemmk_gemmn_global_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmKPack)),
+                   make_pass_through_transform(GemmN)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+    // output tensor: packed [N, K, Ho * Wo] viewed as [GemmM, GemmN]
+    const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)),
+        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    assert(GemmM == out_gemmm_gemmn_global_desc.GetLength(I0));
+    assert(GemmN == out_gemmm_gemmn_global_desc.GetLength(I1));
+    assert(GemmK0 == in_gemmk0_gemmn_gemmk1_global_desc.GetLength(I0));
+    assert(GemmK0 == wei_gemmk0_gemmm_gemmk1_global_desc.GetLength(I0));
+
+    // Only the M and N block tiles are known here. The K-per-block tile size is not a parameter
+    // of this function, so GemmK0's divisibility by it has to be checked by the caller.
+    assert(GemmM % GemmMPerBlock == 0 && GemmN % GemmNPerBlock == 0);
+
+    constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, GemmMPerWave, GemmNPerWave, GemmKPack>{};
+
+    constexpr auto CLayout = xdlops_gemm.GetCLayout();
+
+    // inner lengths used to split GemmM for the output; taken from the xdlops C layout
+    constexpr index_t M1 = CLayout.N1();
+    constexpr index_t M2 = CLayout.M0();
+
+    const auto out_m0_m1_m2_n_global_desc = transform_dynamic_tensor_descriptor(
+        out_gemmm_gemmn_global_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmM / (M1 * M2), M1, M2)),
+                   make_pass_through_transform(GemmN)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}));
+
+    // out_gemm_block_cluster_desc
+    const auto out_gemm_block_cluster_desc = make_cluster_descriptor_v2(
+        make_tuple(GemmM / Number<GemmMPerBlock>{}, GemmN / Number<GemmNPerBlock>{}));
+
+    // hack to control index calculation when iterating over wei_gemmk0_gemmm_gemmk1_global tensor
+    constexpr auto wei_gemmk0_gemmm_gemmk1_global_iterator_hacks = make_tuple(
+        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
+
+    constexpr auto wei_gemmk0_gemmm_gemmk1_global_move_slice_window_iterator_hacks =
+        Sequence<0, 0, 0, 0, 0>{};
+
+    // hack to control index calculation when iterating over in_gemmk0_gemmn_gemmk1_global tensor
+    constexpr auto in_gemmk0_gemmn_gemmk1_global_iterator_hacks = make_tuple(
+        make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 1, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
+        make_tuple(
+            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 2, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));
+
+    constexpr auto in_gemmk0_gemmn_gemmk1_global_move_slice_window_iterator_hacks =
+        Sequence<0, 1, 2, 0, 0>{};
+
+    // hack to control index calculation when iterating over out_m0_m1_m2_n_global tensor
+    // hack for NKHW format
+    constexpr auto out_m0_m1_m2_n_global_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 1, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 0, 0, 0>{},
+                              Sequence<0, 0, 2, 0, 0>{}));
+
+    return make_tuple(wei_gemmk0_gemmm_gemmk1_global_desc,
+                      in_gemmk0_gemmn_gemmk1_global_desc,
+                      out_m0_m1_m2_n_global_desc,
+                      out_gemm_block_cluster_desc,
+                      wei_gemmk0_gemmm_gemmk1_global_iterator_hacks,
+                      in_gemmk0_gemmn_gemmk1_global_iterator_hacks,
+                      out_m0_m1_m2_n_global_iterator_hacks,
+                      wei_gemmk0_gemmm_gemmk1_global_move_slice_window_iterator_hacks,
+                      in_gemmk0_gemmn_gemmk1_global_move_slice_window_iterator_hacks);
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/driver/driver_dynamic_gemm_xdlops_v1.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_gemm_xdlops_v1.hpp
@@ -0,0 +1,384 @@
+#ifndef CK_DRIVER_DYNAMIC_GEMM_XDLOPS_V1
+#define CK_DRIVER_DYNAMIC_GEMM_XDLOPS_V1
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "gridwise_dynamic_gemm_xdlops.hpp"
+#include "gridwise_operation_wrapper.hpp"
+
+namespace ck {
+
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperation CGlobalMemoryDataOperation,
+          typename AGlobalDesc,
+          typename BGlobalDesc,
+          typename CGlobalDesc,
+          typename CBlockClusterDesc,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t KPack,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadSliceLengths_K_M,
+          typename ABlockTransferThreadClusterLengths_K_M,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_M,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          typename BBlockTransferThreadSliceLengths_K_N,
+          typename BBlockTransferThreadClusterLengths_K_N,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_N,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          typename AGlobalIteratorHacks,
+          typename BGlobalIteratorHacks,
+          typename CGlobalIteratorHacks,
+          typename AGlobalMoveSliceWindowIteratorHacks,
+          typename BGlobalMoveSliceWindowIteratorHacks>
+__host__ float launch_kernel_dynamic_gemm_xdlops_v1(const FloatAB* p_a_global,
+                                                    const FloatAB* p_b_global,
+                                                    FloatC* p_c_global,
+                                                    const AGlobalDesc& a_k_m_global_desc,
+                                                    const BGlobalDesc& b_k_n_global_desc,
+                                                    const CGlobalDesc& c_m0_m1_n0_n1_global_desc,
+                                                    const CBlockClusterDesc& c_block_cluster_desc,
+                                                    AGlobalIteratorHacks,
+                                                    BGlobalIteratorHacks,
+                                                    CGlobalIteratorHacks,
+                                                    AGlobalMoveSliceWindowIteratorHacks,
+                                                    BGlobalMoveSliceWindowIteratorHacks,
+                                                    index_t nrepeat)
+
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    const auto M = a_k_m_global_desc.GetLength(I1);
+    const auto N = b_k_n_global_desc.GetLength(I1);
+    const auto K = a_k_m_global_desc.GetLength(I0);
+
+    if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
+    {
+        throw std::runtime_error("wrong! GEMM size no divisible");
+    }
+
+    if(!(MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0))
+    {
+        throw std::runtime_error("wrong! GEMM size no divisible");
+    }
+
+    // GEMM
+    using gridwise_gemm =
+        GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1<BlockSize,
+                                                     FloatAB,
+                                                     FloatAcc,
+                                                     FloatC,
+                                                     CGlobalMemoryDataOperation,
+                                                     AGlobalDesc,
+                                                     BGlobalDesc,
+                                                     CGlobalDesc,
+                                                     CBlockClusterDesc,
+                                                     MPerBlock,
+                                                     NPerBlock,
+                                                     KPerBlock,
+                                                     MPerWave,
+                                                     NPerWave,
+                                                     KPack,
+                                                     MRepeat,
+                                                     NRepeat,
+                                                     ABlockTransferThreadSliceLengths_K_M,
+                                                     ABlockTransferThreadClusterLengths_K_M,
+                                                     ABlockTransferThreadClusterArrangeOrder,
+                                                     ABlockTransferSrcAccessOrder,
+                                                     ABlockTransferSrcVectorDim,
+                                                     ABlockTransferSrcScalarPerVector,
+                                                     ABlockTransferDstScalarPerVector_M,
+                                                     AThreadTransferSrcResetCoordinateAfterRun,
+                                                     BBlockTransferThreadSliceLengths_K_N,
+                                                     BBlockTransferThreadClusterLengths_K_N,
+                                                     BBlockTransferThreadClusterArrangeOrder,
+                                                     BBlockTransferSrcAccessOrder,
+                                                     BBlockTransferSrcVectorDim,
+                                                     BBlockTransferSrcScalarPerVector,
+                                                     BBlockTransferDstScalarPerVector_N,
+                                                     BThreadTransferSrcResetCoordinateAfterRun,
+                                                     CThreadTransferSrcDstAccessOrder,
+                                                     CThreadTransferSrcDstVectorDim,
+                                                     CThreadTransferDstScalarPerVector,
+                                                     AGlobalIteratorHacks,
+                                                     BGlobalIteratorHacks,
+                                                     CGlobalIteratorHacks,
+                                                     AGlobalMoveSliceWindowIteratorHacks,
+                                                     BGlobalMoveSliceWindowIteratorHacks>;
+
+    const auto GridSize = (M / MPerBlock) * (N / NPerBlock);
+
+    const bool has_main_k_block_loop = (K + KPerBlock) / (2 * KPerBlock) > 1;
+
+    const bool has_double_tail_k_block_loop = (K / KPerBlock) % 2 == 0;
+
+    std::cerr << "has_main_k_block_loop = " << has_main_k_block_loop
+              << " has_double_tail_k_block_loop = " << has_double_tail_k_block_loop << std::endl;
+
+#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
+    float ave_time = 0;
+
+    if(has_main_k_block_loop && has_double_tail_k_block_loop)
+    {
+        const auto kernel = kernel_dynamic_gemm_xdlops_v1<gridwise_gemm,
+                                                          FloatAB,
+                                                          FloatAB,
+                                                          FloatC,
+                                                          remove_reference_t<AGlobalDesc>,
+                                                          remove_reference_t<BGlobalDesc>,
+                                                          remove_reference_t<CGlobalDesc>,
+                                                          remove_reference_t<CBlockClusterDesc>,
+                                                          true,
+                                                          true>;
+
+        ave_time = launch_and_time_kernel(kernel,
+                                          nrepeat,
+                                          dim3(GridSize),
+                                          dim3(BlockSize),
+                                          0,
+                                          0,
+                                          p_a_global,
+                                          p_b_global,
+                                          p_c_global,
+                                          a_k_m_global_desc,
+                                          b_k_n_global_desc,
+                                          c_m0_m1_n0_n1_global_desc,
+                                          c_block_cluster_desc);
+    }
+    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
+    {
+        const auto kernel = kernel_dynamic_gemm_xdlops_v1<gridwise_gemm,
+                                                          FloatAB,
+                                                          FloatAB,
+                                                          FloatC,
+                                                          remove_reference_t<AGlobalDesc>,
+                                                          remove_reference_t<BGlobalDesc>,
+                                                          remove_reference_t<CGlobalDesc>,
+                                                          remove_reference_t<CBlockClusterDesc>,
+                                                          true,
+                                                          false>;
+
+        ave_time = launch_and_time_kernel(kernel,
+                                          nrepeat,
+                                          dim3(GridSize),
+                                          dim3(BlockSize),
+                                          0,
+                                          0,
+                                          p_a_global,
+                                          p_b_global,
+                                          p_c_global,
+                                          a_k_m_global_desc,
+                                          b_k_n_global_desc,
+                                          c_m0_m1_n0_n1_global_desc,
+                                          c_block_cluster_desc);
+    }
+    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
+    {
+        const auto kernel = kernel_dynamic_gemm_xdlops_v1<gridwise_gemm,
+                                                          FloatAB,
+                                                          FloatAB,
+                                                          FloatC,
+                                                          remove_reference_t<AGlobalDesc>,
+                                                          remove_reference_t<BGlobalDesc>,
+                                                          remove_reference_t<CGlobalDesc>,
+                                                          remove_reference_t<CBlockClusterDesc>,
+                                                          false,
+                                                          true>;
+
+        ave_time = launch_and_time_kernel(kernel,
+                                          nrepeat,
+                                          dim3(GridSize),
+                                          dim3(BlockSize),
+                                          0,
+                                          0,
+                                          p_a_global,
+                                          p_b_global,
+                                          p_c_global,
+                                          a_k_m_global_desc,
+                                          b_k_n_global_desc,
+                                          c_m0_m1_n0_n1_global_desc,
+                                          c_block_cluster_desc);
+    }
+    else
+    {
+        const auto kernel = kernel_dynamic_gemm_xdlops_v1<gridwise_gemm,
+                                                          FloatAB,
+                                                          FloatAB,
+                                                          FloatC,
+                                                          remove_reference_t<AGlobalDesc>,
+                                                          remove_reference_t<BGlobalDesc>,
+                                                          remove_reference_t<CGlobalDesc>,
+                                                          remove_reference_t<CBlockClusterDesc>,
+                                                          false,
+                                                          false>;
+
+        ave_time = launch_and_time_kernel(kernel,
+                                          nrepeat,
+                                          dim3(GridSize),
+                                          dim3(BlockSize),
+                                          0,
+                                          0,
+                                          p_a_global,
+                                          p_b_global,
+                                          p_c_global,
+                                          a_k_m_global_desc,
+                                          b_k_n_global_desc,
+                                          c_m0_m1_n0_n1_global_desc,
+                                          c_block_cluster_desc);
+    }
+
+    return ave_time;
+#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
+    DeviceMem a_k_m_global_desc_device_buf(sizeof(AGlobalDesc));
+    DeviceMem b_k_n_global_desc_device_buf(sizeof(BGlobalDesc));
+    DeviceMem c_m0_m1_n0_n1_global_desc_device_buf(sizeof(CGlobalDesc));
+    DeviceMem c_block_cluster_desc_device_buf(sizeof(c_block_cluster_desc));
+
+    a_k_m_global_desc_device_buf.ToDevice(&a_k_m_global_desc);
+    b_k_n_global_desc_device_buf.ToDevice(&b_k_n_global_desc);
+    c_m0_m1_n0_n1_global_desc_device_buf.ToDevice(&c_m0_m1_n0_n1_global_desc);
+    c_block_cluster_desc_device_buf.ToDevice(&c_block_cluster_desc);
+
+    float ave_time = 0;
+
+    if(has_main_k_block_loop && has_double_tail_k_block_loop)
+    {
+        const auto kernel = kernel_dynamic_gemm_xdlops_v1<gridwise_gemm,
+                                                          FloatAB,
+                                                          FloatAB,
+                                                          FloatC,
+                                                          remove_reference_t<AGlobalDesc>,
+                                                          remove_reference_t<BGlobalDesc>,
+                                                          remove_reference_t<CGlobalDesc>,
+                                                          remove_reference_t<CBlockClusterDesc>,
+                                                          true,
+                                                          true>;
+
+        ave_time = launch_and_time_kernel(
+            kernel,
+            nrepeat,
+            dim3(GridSize),
+            dim3(BlockSize),
+            0,
+            0,
+            p_a_global,
+            p_b_global,
+            p_c_global,
+            (void __CONSTANT__*)a_k_m_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)b_k_n_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)c_m0_m1_n0_n1_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)c_block_cluster_desc_device_buf.GetDeviceBuffer());
+    }
+    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
+    {
+        const auto kernel = kernel_dynamic_gemm_xdlops_v1<gridwise_gemm,
+                                                          FloatAB,
+                                                          FloatAB,
+                                                          FloatC,
+                                                          remove_reference_t<AGlobalDesc>,
+                                                          remove_reference_t<BGlobalDesc>,
+                                                          remove_reference_t<CGlobalDesc>,
+                                                          remove_reference_t<CBlockClusterDesc>,
+                                                          true,
+                                                          false>;
+
+        ave_time = launch_and_time_kernel(
+            kernel,
+            nrepeat,
+            dim3(GridSize),
+            dim3(BlockSize),
+            0,
+            0,
+            p_a_global,
+            p_b_global,
+            p_c_global,
+            (void __CONSTANT__*)a_k_m_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)b_k_n_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)c_m0_m1_n0_n1_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)c_block_cluster_desc_device_buf.GetDeviceBuffer());
+    }
+    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
+    {
+        const auto kernel = kernel_dynamic_gemm_xdlops_v1<gridwise_gemm,
+                                                          FloatAB,
+                                                          FloatAB,
+                                                          FloatC,
+                                                          remove_reference_t<AGlobalDesc>,
+                                                          remove_reference_t<BGlobalDesc>,
+                                                          remove_reference_t<CGlobalDesc>,
+                                                          remove_reference_t<CBlockClusterDesc>,
+                                                          false,
+                                                          true>;
+
+        ave_time = launch_and_time_kernel(
+            kernel,
+            nrepeat,
+            dim3(GridSize),
+            dim3(BlockSize),
+            0,
+            0,
+            p_a_global,
+            p_b_global,
+            p_c_global,
+            (void __CONSTANT__*)a_k_m_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)b_k_n_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)c_m0_m1_n0_n1_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)c_block_cluster_desc_device_buf.GetDeviceBuffer());
+    }
+    else
+    {
+        const auto kernel = kernel_dynamic_gemm_xdlops_v1<gridwise_gemm,
+                                                          FloatAB,
+                                                          FloatAB,
+                                                          FloatC,
+                                                          remove_reference_t<AGlobalDesc>,
+                                                          remove_reference_t<BGlobalDesc>,
+                                                          remove_reference_t<CGlobalDesc>,
+                                                          remove_reference_t<CBlockClusterDesc>,
+                                                          false,
+                                                          false>;
+
+        ave_time = launch_and_time_kernel(
+            kernel,
+            nrepeat,
+            dim3(GridSize),
+            dim3(BlockSize),
+            0,
+            0,
+            p_a_global,
+            p_b_global,
+            p_c_global,
+            (void __CONSTANT__*)a_k_m_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)b_k_n_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)c_m0_m1_n0_n1_global_desc_device_buf.GetDeviceBuffer(),
+            (void __CONSTANT__*)c_block_cluster_desc_device_buf.GetDeviceBuffer());
+    }
+
+    return ave_time;
+#endif
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/driver/driver_dynamic_gemm_xdlops_v2.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_gemm_xdlops_v2.hpp
@@ -0,0 +1,202 @@
+#ifndef CK_DRIVER_DYNAMIC_GEMM_XDLOPS_V2
+#define CK_DRIVER_DYNAMIC_GEMM_XDLOPS_V2
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "gridwise_dynamic_gemm_xdlops_v2.hpp"
+#include "gridwise_operation_wrapper.hpp"
+
+namespace ck {
+
+// Host-side launcher for the "v2" xdlops GEMM kernel.
+//
+// What it does, in order:
+//   1. Reads M = a_k_m_global_desc.GetLength(I1), N = b_k_n_global_desc.GetLength(I1) and
+//      K = a_k_m_global_desc.GetLength(I0).
+//   2. Rejects problems where M / N / K is not a multiple of MPerBlock / NPerBlock / KPerBlock,
+//      and tile configurations where MPerBlock / NPerBlock is not a multiple of
+//      MPerWave / NPerWave.
+//   3. Instantiates GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2 from the template arguments.
+//   4. Launches kernel_dynamic_gemm_xdlops_v2 through launch_and_time_kernel with
+//      (M / MPerBlock) * (N / NPerBlock) blocks of BlockSize threads.
+//
+// Parameters:
+//   p_a_global, p_b_global, p_c_global - buffers forwarded unchanged to the kernel.
+//   a_k_m_global_desc, b_k_n_global_desc, c_m0_m1_n0_n1_global_desc, c_block_cluster_desc -
+//       descriptors forwarded to the kernel, either by value or through device buffers
+//       depending on which CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_* macro is enabled.
+//   five unnamed *IteratorHacks arguments - only their types are used (as template arguments
+//       of the gridwise GEMM); the values themselves are never read here.
+//   nrepeat - forwarded to launch_and_time_kernel.
+//
+// Returns whatever launch_and_time_kernel returns, converted to float (presumably the average
+// kernel time over nrepeat runs -- confirm against launch_and_time_kernel).
+//
+// Throws std::runtime_error when either divisibility check fails.
+//
+// NOTE(review): if neither CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE nor
+// CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER is enabled, no launch path is compiled
+// and the function has no return statement.
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperation CGlobalMemoryDataOperation,
+          typename AGlobalDesc,
+          typename BGlobalDesc,
+          typename CGlobalDesc,
+          typename CBlockClusterDesc,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t KPack,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadSliceLengths_K_M,
+          typename ABlockTransferThreadClusterLengths_K_M,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_M,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          typename BBlockTransferThreadSliceLengths_K_N,
+          typename BBlockTransferThreadClusterLengths_K_N,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_N,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          typename AGlobalIteratorHacks,
+          typename BGlobalIteratorHacks,
+          typename CGlobalIteratorHacks,
+          typename AGlobalMoveSliceWindowIteratorHacks,
+          typename BGlobalMoveSliceWindowIteratorHacks>
+__host__ float launch_kernel_dynamic_gemm_xdlops_v2(const FloatAB* p_a_global,
+                                                    const FloatAB* p_b_global,
+                                                    FloatC* p_c_global,
+                                                    const AGlobalDesc& a_k_m_global_desc,
+                                                    const BGlobalDesc& b_k_n_global_desc,
+                                                    const CGlobalDesc& c_m0_m1_n0_n1_global_desc,
+                                                    const CBlockClusterDesc& c_block_cluster_desc,
+                                                    AGlobalIteratorHacks,
+                                                    BGlobalIteratorHacks,
+                                                    CGlobalIteratorHacks,
+                                                    AGlobalMoveSliceWindowIteratorHacks,
+                                                    BGlobalMoveSliceWindowIteratorHacks,
+                                                    index_t nrepeat)
+
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    // Problem sizes: dimension 0 of the A descriptor is K, dimension 1 of A / B is M / N.
+    const auto M = a_k_m_global_desc.GetLength(I1);
+    const auto N = b_k_n_global_desc.GetLength(I1);
+    const auto K = a_k_m_global_desc.GetLength(I0);
+
+    // The grid size below uses exact division, so partial block tiles are rejected up front.
+    if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
+    {
+        throw std::runtime_error("wrong! GEMM size no divisible");
+    }
+
+    // A block tile has to be a whole number of wave tiles in both M and N.
+    if(!(MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0))
+    {
+        throw std::runtime_error("wrong! GEMM size no divisible");
+    }
+
+    // GEMM
+    using gridwise_gemm =
+        GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2<BlockSize,
+                                                     FloatAB,
+                                                     FloatAcc,
+                                                     FloatC,
+                                                     CGlobalMemoryDataOperation,
+                                                     AGlobalDesc,
+                                                     BGlobalDesc,
+                                                     CGlobalDesc,
+                                                     CBlockClusterDesc,
+                                                     MPerBlock,
+                                                     NPerBlock,
+                                                     KPerBlock,
+                                                     MPerWave,
+                                                     NPerWave,
+                                                     KPack,
+                                                     MRepeat,
+                                                     NRepeat,
+                                                     ABlockTransferThreadSliceLengths_K_M,
+                                                     ABlockTransferThreadClusterLengths_K_M,
+                                                     ABlockTransferThreadClusterArrangeOrder,
+                                                     ABlockTransferSrcAccessOrder,
+                                                     ABlockTransferSrcVectorDim,
+                                                     ABlockTransferSrcScalarPerVector,
+                                                     ABlockTransferDstScalarPerVector_M,
+                                                     AThreadTransferSrcResetCoordinateAfterRun,
+                                                     BBlockTransferThreadSliceLengths_K_N,
+                                                     BBlockTransferThreadClusterLengths_K_N,
+                                                     BBlockTransferThreadClusterArrangeOrder,
+                                                     BBlockTransferSrcAccessOrder,
+                                                     BBlockTransferSrcVectorDim,
+                                                     BBlockTransferSrcScalarPerVector,
+                                                     BBlockTransferDstScalarPerVector_N,
+                                                     BThreadTransferSrcResetCoordinateAfterRun,
+                                                     CThreadTransferSrcDstAccessOrder,
+                                                     CThreadTransferSrcDstVectorDim,
+                                                     CThreadTransferDstScalarPerVector,
+                                                     AGlobalIteratorHacks,
+                                                     BGlobalIteratorHacks,
+                                                     CGlobalIteratorHacks,
+                                                     AGlobalMoveSliceWindowIteratorHacks,
+                                                     BGlobalMoveSliceWindowIteratorHacks>;
+
+    // One block per MPerBlock x NPerBlock tile of C.
+    const auto GridSize = (M / MPerBlock) * (N / NPerBlock);
+
+#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
+    // Descriptors are passed to the kernel as ordinary by-value kernel arguments.
+    const auto kernel = kernel_dynamic_gemm_xdlops_v2<gridwise_gemm,
+                                                      FloatAB,
+                                                      FloatAB,
+                                                      FloatC,
+                                                      remove_reference_t<AGlobalDesc>,
+                                                      remove_reference_t<BGlobalDesc>,
+                                                      remove_reference_t<CGlobalDesc>,
+                                                      remove_reference_t<CBlockClusterDesc>>;
+
+    // NOTE(review): the two literal 0 arguments are presumably the dynamic LDS size and the
+    // stream -- confirm against launch_and_time_kernel.
+    float ave_time = launch_and_time_kernel(kernel,
+                                            nrepeat,
+                                            dim3(GridSize),
+                                            dim3(BlockSize),
+                                            0,
+                                            0,
+                                            p_a_global,
+                                            p_b_global,
+                                            p_c_global,
+                                            a_k_m_global_desc,
+                                            b_k_n_global_desc,
+                                            c_m0_m1_n0_n1_global_desc,
+                                            c_block_cluster_desc);
+
+    return ave_time;
+#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
+    // Each descriptor is copied into its own device buffer; the kernel then receives
+    // __CONSTANT__ void pointers to those copies instead of the descriptor objects.
+    DeviceMem a_k_m_global_desc_device_buf(sizeof(AGlobalDesc));
+    DeviceMem b_k_n_global_desc_device_buf(sizeof(BGlobalDesc));
+    DeviceMem c_m0_m1_n0_n1_global_desc_device_buf(sizeof(CGlobalDesc));
+    DeviceMem c_block_cluster_desc_device_buf(sizeof(CBlockClusterDesc));
+
+    a_k_m_global_desc_device_buf.ToDevice(&a_k_m_global_desc);
+    b_k_n_global_desc_device_buf.ToDevice(&b_k_n_global_desc);
+    c_m0_m1_n0_n1_global_desc_device_buf.ToDevice(&c_m0_m1_n0_n1_global_desc);
+    c_block_cluster_desc_device_buf.ToDevice(&c_block_cluster_desc);
+
+    // Same v2 kernel entry point as the by-value path; only the descriptor passing differs.
+    // NOTE(review): relies on gridwise_dynamic_gemm_xdlops_v2.hpp providing the
+    // void-pointer flavour of kernel_dynamic_gemm_xdlops_v2 under this macro -- confirm.
+    const auto kernel = kernel_dynamic_gemm_xdlops_v2<gridwise_gemm,
+                                                      FloatAB,
+                                                      FloatAB,
+                                                      FloatC,
+                                                      remove_reference_t<AGlobalDesc>,
+                                                      remove_reference_t<BGlobalDesc>,
+                                                      remove_reference_t<CGlobalDesc>,
+                                                      remove_reference_t<CBlockClusterDesc>>;
+
+    float ave_time = launch_and_time_kernel(
+        kernel,
+        nrepeat,
+        dim3(GridSize),
+        dim3(BlockSize),
+        0,
+        0,
+        p_a_global,
+        p_b_global,
+        p_c_global,
+        (void __CONSTANT__*)a_k_m_global_desc_device_buf.GetDeviceBuffer(),
+        (void __CONSTANT__*)b_k_n_global_desc_device_buf.GetDeviceBuffer(),
+        (void __CONSTANT__*)c_m0_m1_n0_n1_global_desc_device_buf.GetDeviceBuffer(),
+        (void __CONSTANT__*)c_block_cluster_desc_device_buf.GetDeviceBuffer());
+
+    return ave_time;
+#endif
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/driver/driver_dynamic_gemm_xdlops_v2r2.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_gemm_xdlops_v2r2.hpp
@@ -0,0 +1,167 @@
+#ifndef CK_DRIVER_DYNAMIC_GEMM_XDLOPS_V2R2
+#define CK_DRIVER_DYNAMIC_GEMM_XDLOPS_V2R2
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "gridwise_dynamic_gemm_xdlops_v2r2.hpp"
+
+namespace ck {
+
+// Host-side driver for the "v2r2" xdlops GEMM kernel.
+//
+// What it does, in order:
+//   1. Prints the three lengths of a_k0_m_k1_grid_desc and b_k0_n_k1_grid_desc and the two
+//      lengths of c_m_n_grid_desc to std::cout.
+//   2. Asks GridwiseGemm::CheckValidity whether the descriptors are acceptable for this
+//      GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r2 instantiation.
+//   3. Derives the C descriptor actually handed to the kernel
+//      (GridwiseGemm::MakeCM0M1M2NGridDescriptor), the block-cluster adaptor
+//      (GridwiseGemm::MakeCBlockClusterAdaptor) and the grid size
+//      (GridwiseGemm::CalculateGridSize), all from c_m_n_grid_desc.
+//   4. Launches kernel_dynamic_gemm_xdlops_v2r2 through launch_and_time_kernel with that grid
+//      size and BlockSize threads per block.
+//
+// Parameters:
+//   p_a_grid, p_b_grid, p_c_grid - buffers forwarded unchanged to the kernel.
+//   a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc - descriptors forwarded by value to the kernel.
+//   c_m_n_grid_desc - only used on the host to derive the kernel's C descriptor, the block
+//       cluster adaptor and the grid size.
+//   five unnamed *IteratorHacks arguments - only their types are used (as template arguments
+//       of the gridwise GEMM); the values themselves are never read here.
+//   nrepeat - forwarded to launch_and_time_kernel.
+//
+// Returns whatever launch_and_time_kernel returns, converted to float (presumably the average
+// kernel time over nrepeat runs -- confirm against launch_and_time_kernel).
+//
+// Throws std::runtime_error when GridwiseGemm::CheckValidity returns false.
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperation CGlobalMemoryDataOperation,
+          typename AK0MK1GridDesc,
+          typename BK0NK1GridDesc,
+          typename CMNGridDesc,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadSliceLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_K1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          typename BBlockTransferThreadSliceLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_K1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          typename AGridIteratorHacks,
+          typename BGridIteratorHacks,
+          typename CGridIteratorHacks,
+          typename AGridMoveSliceWindowIteratorHacks,
+          typename BGridMoveSliceWindowIteratorHacks>
+__host__ float driver_dynamic_gemm_xdlops_v2r2(const FloatAB* p_a_grid,
+                                               const FloatAB* p_b_grid,
+                                               FloatC* p_c_grid,
+                                               const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
+                                               const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
+                                               const CMNGridDesc& c_m_n_grid_desc,
+                                               AGridIteratorHacks,
+                                               BGridIteratorHacks,
+                                               CGridIteratorHacks,
+                                               AGridMoveSliceWindowIteratorHacks,
+                                               BGridMoveSliceWindowIteratorHacks,
+                                               index_t nrepeat)
+
+{
+    // Only dimensions 0..2 are indexed in this function (for the diagnostic print below).
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+
+    using GridwiseGemm =
+        GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r2<BlockSize,
+                                                       FloatAB,
+                                                       FloatAcc,
+                                                       FloatC,
+                                                       CGlobalMemoryDataOperation,
+                                                       AK0MK1GridDesc,
+                                                       BK0NK1GridDesc,
+                                                       CMNGridDesc,
+                                                       MPerBlock,
+                                                       NPerBlock,
+                                                       KPerBlock,
+                                                       MPerWave,
+                                                       NPerWave,
+                                                       MRepeat,
+                                                       NRepeat,
+                                                       ABlockTransferThreadSliceLengths_K0_M_K1,
+                                                       ABlockTransferThreadClusterLengths_K0_M_K1,
+                                                       ABlockTransferThreadClusterArrangeOrder,
+                                                       ABlockTransferSrcAccessOrder,
+                                                       ABlockTransferSrcVectorDim,
+                                                       ABlockTransferSrcScalarPerVector,
+                                                       ABlockTransferDstScalarPerVector_K1,
+                                                       AThreadTransferSrcResetCoordinateAfterRun,
+                                                       BBlockTransferThreadSliceLengths_K0_N_K1,
+                                                       BBlockTransferThreadClusterLengths_K0_N_K1,
+                                                       BBlockTransferThreadClusterArrangeOrder,
+                                                       BBlockTransferSrcAccessOrder,
+                                                       BBlockTransferSrcVectorDim,
+                                                       BBlockTransferSrcScalarPerVector,
+                                                       BBlockTransferDstScalarPerVector_K1,
+                                                       BThreadTransferSrcResetCoordinateAfterRun,
+                                                       CThreadTransferSrcDstAccessOrder,
+                                                       CThreadTransferSrcDstVectorDim,
+                                                       CThreadTransferDstScalarPerVector,
+                                                       AGridIteratorHacks,
+                                                       BGridIteratorHacks,
+                                                       CGridIteratorHacks,
+                                                       AGridMoveSliceWindowIteratorHacks,
+                                                       BGridMoveSliceWindowIteratorHacks>;
+
+    // Diagnostic dump of the descriptor lengths; printed on every call.
+    {
+        std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", "
+                  << a_k0_m_k1_grid_desc.GetLength(I1) << ", " << a_k0_m_k1_grid_desc.GetLength(I2)
+                  << "}" << std::endl;
+
+        std::cout << "b_k0_n_k1_grid_desc{" << b_k0_n_k1_grid_desc.GetLength(I0) << ", "
+                  << b_k0_n_k1_grid_desc.GetLength(I1) << ", " << b_k0_n_k1_grid_desc.GetLength(I2)
+                  << "}" << std::endl;
+
+        std::cout << "c_m_n_grid_desc{ " << c_m_n_grid_desc.GetLength(I0) << ", "
+                  << c_m_n_grid_desc.GetLength(I1) << "}" << std::endl;
+    }
+
+    // Refuse to launch when the gridwise GEMM reports the descriptors as unsupported.
+    if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
+    {
+        throw std::runtime_error(
+            "wrong! GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r2 has invalid setting");
+    }
+
+    // The kernel does not take c_m_n_grid_desc directly; it takes the descriptor and the
+    // block-cluster adaptor that GridwiseGemm derives from it.
+    const auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
+
+    using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc);
+
+    const auto c_block_cluster_adaptor = GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
+
+    using CBlockClusterAdaptor = decltype(c_block_cluster_adaptor);
+
+    const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc);
+
+    const auto kernel = kernel_dynamic_gemm_xdlops_v2r2<GridwiseGemm,
+                                                        FloatAB,
+                                                        FloatC,
+                                                        remove_reference_t<AK0MK1GridDesc>,
+                                                        remove_reference_t<BK0NK1GridDesc>,
+                                                        remove_reference_t<CM0M1M2NGridDesc>,
+                                                        remove_reference_t<CBlockClusterAdaptor>>;
+
+    // NOTE(review): the two literal 0 arguments are presumably the dynamic LDS size and the
+    // stream -- confirm against launch_and_time_kernel.
+    float ave_time = launch_and_time_kernel(kernel,
+                                            nrepeat,
+                                            dim3(grid_size),
+                                            dim3(BlockSize),
+                                            0,
+                                            0,
+                                            p_a_grid,
+                                            p_b_grid,
+                                            p_c_grid,
+                                            a_k0_m_k1_grid_desc,
+                                            b_k0_n_k1_grid_desc,
+                                            c_m0_m1_m2_n_grid_desc,
+                                            c_block_cluster_adaptor);
+
+    return ave_time;
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/driver/driver_dynamic_gemm_xdlops_v2r3.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_gemm_xdlops_v2r3.hpp
@@ -0,0 +1,169 @@
+#ifndef CK_DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3
+#define CK_DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp"
+
+namespace ck {
+
+// Host-side driver: builds GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 from the tuning
+// parameters, prints the A/B/C grid descriptor lengths, validates the setting with
+// GridwiseGemm::CheckValidity (throws std::runtime_error on failure), then launches
+// kernel_dynamic_gemm_xdlops_v2r3 through launch_and_time_kernel with `nrepeat`.
+// The five unnamed hack arguments are never read here; only their types are forwarded
+// to GridwiseGemm. Returns the value reported by launch_and_time_kernel.
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperation CGlobalMemoryDataOperation,
+          typename AK0MK1GridDesc,
+          typename BK0NK1GridDesc,
+          typename CMNGridDesc,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadSliceLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_K1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          typename BBlockTransferThreadSliceLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_K1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          typename AGridIteratorHacks,
+          typename BGridIteratorHacks,
+          typename CGridIteratorHacks,
+          typename AGridMoveSliceWindowIteratorHacks,
+          typename BGridMoveSliceWindowIteratorHacks,
+          bool CAccessOrderMRepeatNRepeat>
+__host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
+                                               const FloatAB* p_b_grid,
+                                               FloatC* p_c_grid,
+                                               const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
+                                               const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
+                                               const CMNGridDesc& c_m_n_grid_desc,
+                                               AGridIteratorHacks,
+                                               BGridIteratorHacks,
+                                               CGridIteratorHacks,
+                                               AGridMoveSliceWindowIteratorHacks,
+                                               BGridMoveSliceWindowIteratorHacks,
+                                               index_t nrepeat)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+
+    using GridwiseGemm =
+        GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
+                                                       FloatAB,
+                                                       FloatAcc,
+                                                       FloatC,
+                                                       CGlobalMemoryDataOperation,
+                                                       AK0MK1GridDesc,
+                                                       BK0NK1GridDesc,
+                                                       CMNGridDesc,
+                                                       MPerBlock,
+                                                       NPerBlock,
+                                                       KPerBlock,
+                                                       MPerWave,
+                                                       NPerWave,
+                                                       MRepeat,
+                                                       NRepeat,
+                                                       ABlockTransferThreadSliceLengths_K0_M_K1,
+                                                       ABlockTransferThreadClusterLengths_K0_M_K1,
+                                                       ABlockTransferThreadClusterArrangeOrder,
+                                                       ABlockTransferSrcAccessOrder,
+                                                       ABlockTransferSrcVectorDim,
+                                                       ABlockTransferSrcScalarPerVector,
+                                                       ABlockTransferDstScalarPerVector_K1,
+                                                       AThreadTransferSrcResetCoordinateAfterRun,
+                                                       BBlockTransferThreadSliceLengths_K0_N_K1,
+                                                       BBlockTransferThreadClusterLengths_K0_N_K1,
+                                                       BBlockTransferThreadClusterArrangeOrder,
+                                                       BBlockTransferSrcAccessOrder,
+                                                       BBlockTransferSrcVectorDim,
+                                                       BBlockTransferSrcScalarPerVector,
+                                                       BBlockTransferDstScalarPerVector_K1,
+                                                       BThreadTransferSrcResetCoordinateAfterRun,
+                                                       CThreadTransferSrcDstAccessOrder,
+                                                       CThreadTransferSrcDstVectorDim,
+                                                       CThreadTransferDstScalarPerVector,
+                                                       AGridIteratorHacks,
+                                                       BGridIteratorHacks,
+                                                       CGridIteratorHacks,
+                                                       AGridMoveSliceWindowIteratorHacks,
+                                                       BGridMoveSliceWindowIteratorHacks,
+                                                       CAccessOrderMRepeatNRepeat>;
+    // print the lengths of the A, B and C grid descriptors
+    {
+        std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", "
+                  << a_k0_m_k1_grid_desc.GetLength(I1) << ", " << a_k0_m_k1_grid_desc.GetLength(I2)
+                  << "}" << std::endl;
+
+        std::cout << "b_k0_n_k1_grid_desc{" << b_k0_n_k1_grid_desc.GetLength(I0) << ", "
+                  << b_k0_n_k1_grid_desc.GetLength(I1) << ", " << b_k0_n_k1_grid_desc.GetLength(I2)
+                  << "}" << std::endl;
+
+        std::cout << "c_m_n_grid_desc{ " << c_m_n_grid_desc.GetLength(I0) << ", "
+                  << c_m_n_grid_desc.GetLength(I1) << "}" << std::endl;
+    }
+    // reject settings that GridwiseGemm reports as invalid
+    if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
+    {
+        throw std::runtime_error(
+            "wrong! GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
+    }
+    // derive the C descriptor and block-cluster adaptor that are passed to the kernel
+    const auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
+
+    using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc);
+
+    const auto c_block_cluster_adaptor = GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
+
+    using CBlockClusterAdaptor = decltype(c_block_cluster_adaptor);
+
+    const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc);
+
+    const auto kernel = kernel_dynamic_gemm_xdlops_v2r3<GridwiseGemm,
+                                                        FloatAB,
+                                                        FloatC,
+                                                        remove_reference_t<AK0MK1GridDesc>,
+                                                        remove_reference_t<BK0NK1GridDesc>,
+                                                        remove_reference_t<CM0M1M2NGridDesc>,
+                                                        remove_reference_t<CBlockClusterAdaptor>>;
+
+    return launch_and_time_kernel(kernel,
+                                  nrepeat,
+                                  dim3(grid_size),
+                                  dim3(BlockSize),
+                                  0,
+                                  0,
+                                  p_a_grid,
+                                  p_b_grid,
+                                  p_c_grid,
+                                  a_k0_m_k1_grid_desc,
+                                  b_k0_n_k1_grid_desc,
+                                  c_m0_m1_m2_n_grid_desc,
+                                  c_block_cluster_adaptor);
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -116,8 +116,8 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v4r1_nchw_kcyx_nkhw
        constexpr index_t GemmN = N * HTildaSlice * WTildaSlice;

        // GemmK is different for each GEMM
-        index_t YDotSlice = (iYTilda + 1) * YDot <= Y ? YDot : Y % YDot;
-        index_t XDotSlice = (iXTilda + 1) * XDot <= X ? XDot : X % XDot;
+        index_t YDotSlice = math::integer_divide_ceil(Y - iYTilda, YTilda);
+        index_t XDotSlice = math::integer_divide_ceil(X - iXTilda, XTilda);

        index_t GemmK = K * YDotSlice * XDotSlice;

@@ -176,8 +176,8 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v4r1_nchw_kcyx_nkhw
        constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda);
        constexpr index_t XDot = math::integer_divide_ceil(X, XTilda);

-        constexpr index_t YDotSlice = (iYTilda + 1) * YDot <= Y ? YDot : Y % YDot;
-        constexpr index_t XDotSlice = (iXTilda + 1) * XDot <= X ? XDot : X % XDot;
+        constexpr index_t YDotSlice = math::integer_divide_ceil(Y - iYTilda, YTilda);
+        constexpr index_t XDotSlice = math::integer_divide_ceil(X - iXTilda, XTilda);

        constexpr index_t HTilda =
            Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk.hpp
@@ -118,8 +118,8 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v5r1_nhwc_kyxc_nhwk
        constexpr index_t GemmN = N * HTildaSlice * WTildaSlice;

        // GemmK is different for each GEMM
-        index_t YDotSlice = (iYTilda + 1) * YDot <= Y ? YDot : Y % YDot;
-        index_t XDotSlice = (iXTilda + 1) * XDot <= X ? XDot : X % XDot;
+        index_t YDotSlice = math::integer_divide_ceil(Y - iYTilda, YTilda);
+        index_t XDotSlice = math::integer_divide_ceil(X - iXTilda, XTilda);

        index_t GemmK0 = YDotSlice;
        index_t GemmK1 = XDotSlice;
@@ -180,8 +180,8 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v5r1_nhwc_kyxc_nhwk
        constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda);
        constexpr index_t XDot = math::integer_divide_ceil(X, XTilda);

-        constexpr index_t YDotSlice = (iYTilda + 1) * YDot <= Y ? YDot : Y % YDot;
-        constexpr index_t XDotSlice = (iXTilda + 1) * XDot <= X ? XDot : X % XDot;
+        constexpr index_t YDotSlice = math::integer_divide_ceil(Y - iYTilda, YTilda);
+        constexpr index_t XDotSlice = math::integer_divide_ceil(X - iXTilda, XTilda);

        constexpr index_t HTilda =
            Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
--- a/composable_kernel/include/kernel_algorithm/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/kernel_algorithm/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,272 @@
+#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
+#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+
+namespace ck {
+
+// Builds GEMM descriptors for sub-GEMM (IYTilda, IXTilda); number of GEMMs = YTilda * XTilda
+// GemmM = C, GemmN = N * HTildaSlice * WTildaSlice, GemmK = K * YDotSlice * XDotSlice
+// GemmK is exposed as GemmK0 (merge of YDotSlice, XDotSlice, K / GemmK1) and GemmK1
+// Returns tuple: wei[GemmK0, GemmM, GemmK1], out[GemmK0, GemmN, GemmK1], in[GemmM, GemmN]
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t IYTildaValue,
+          index_t IXTildaValue,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk(
+    const DynamicTensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const DynamicTensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
+    const DynamicTensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Number<IYTildaValue>, // index at which the YTilda dimension is frozen
+    Number<IXTildaValue>, // index at which the XTilda dimension is frozen
+    Number<GemmK1Value>) // length K1 used when K is unmerged into (K0, K1)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto GemmK1  = Number<GemmK1Value>{};
+    constexpr auto IYTilda = Number<IYTildaValue>{};
+    constexpr auto IXTilda = Number<IXTildaValue>{};
+    // lengths read from the descriptors: in is (N, Hi, Wi, C), out dim 3 is K
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
+
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+    // element 0 of each parameter tuple is the H component, element 1 the W component
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    // YTilda/XTilda = stride / gcd(stride, dilation); YDot/XDot = ceil(Y or X / Tilda)
+    const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+    const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+
+    const auto YTilda = ConvStrideH / GcdStrideDilationH;
+    const auto XTilda = ConvStrideW / GcdStrideDilationW;
+
+    const auto YDot = math::integer_divide_ceil(Y, YTilda);
+    const auto XDot = math::integer_divide_ceil(X, XTilda);
+
+    const auto HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
+    const auto WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
+
+    // only work on HTilda and WTilda that contribute to the non-padding area of input tensor
+    const auto IHTildaSliceBegin = math::integer_divide_floor(
+        math::max(I0, InLeftPadH - ConvDilationH * (YTilda - I1)), ConvStrideH);
+    const auto IWTildaSliceBegin = math::integer_divide_floor(
+        math::max(I0, InLeftPadW - ConvDilationW * (XTilda - I1)), ConvStrideW);
+
+    const auto IHTildaSliceEnd =
+        math::min(HTilda, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
+    const auto IWTildaSliceEnd =
+        math::min(WTilda, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
+
+    const auto HTildaSlice = IHTildaSliceEnd - IHTildaSliceBegin;
+    const auto WTildaSlice = IWTildaSliceEnd - IWTildaSliceBegin;
+
+    // GemmK is different for each GEMM: the slice lengths depend on IYTilda and IXTilda
+    const auto YDotSlice = math::integer_divide_ceil(Y - IYTilda, YTilda);
+    const auto XDotSlice = math::integer_divide_ceil(X - IXTilda, XTilda);
+    // split K into K0 * K1; K / K1 is a plain division, divisibility is not checked here
+    const auto K1 = GemmK1;
+    const auto K0 = K / K1;
+
+    // weight tensor: embed (YDot, YTilda) onto Y and (XDot, XTilda) onto X
+    const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_dynamic_tensor_descriptor(
+        wei_k_y_x_c_grid_desc,
+        make_tuple(make_pass_through_transform(K),
+                   make_embed_transform(make_tuple(YDot, YTilda),
+                                        make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
+                   make_embed_transform(make_tuple(XDot, XTilda),
+                                        make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    // unmerge K into (K0, K1), slice YDot/XDot, freeze YTilda/XTilda at IYTilda/IXTilda
+    const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc =
+        transform_dynamic_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc,
+                                            make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
+                                                       make_slice_transform(YDot, I0, YDotSlice),
+                                                       make_slice_transform(XDot, I0, XDotSlice),
+                                                       make_freeze_transform(IYTilda),
+                                                       make_freeze_transform(IXTilda),
+                                                       make_pass_through_transform(C)),
+                                            make_tuple(Sequence<0>{},
+                                                       Sequence<1>{},
+                                                       Sequence<3>{},
+                                                       Sequence<2>{},
+                                                       Sequence<4>{},
+                                                       Sequence<5>{}),
+                                            make_tuple(Sequence<0, 1>{},
+                                                       Sequence<2>{},
+                                                       Sequence<3>{},
+                                                       Sequence<>{},
+                                                       Sequence<>{},
+                                                       Sequence<4>{}));
+
+#if 1 // active: GemmK0 is merged in (YDotSlice, XDotSlice, K0) order
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
+                   make_pass_through_transform(C),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else // disabled alternative: GemmK0 merged in (K0, YDotSlice, XDotSlice) order
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
+                   make_pass_through_transform(C),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#endif
+
+    // output tensor
+    // this adds a padding check; the pad widths are I0 on both sides of Ho and Wo
+    const auto out_n_hop_wop_k_grid_desc = transform_dynamic_tensor_descriptor(
+        out_n_ho_wo_k_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Ho, I0, I0),
+                   make_pad_transform(Wo, I0, I0),
+                   make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    // embed (YDot, HTilda) onto Ho and (XDot, WTilda) onto Wo; YDot/XDot coefficients are negative
+    const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_dynamic_tensor_descriptor(
+        out_n_hop_wop_k_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(YDot, HTilda),
+                                        make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
+                   make_embed_transform(make_tuple(XDot, WTilda),
+                                        make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
+                   make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    // slice the four embedded dimensions and unmerge K into (K0, K1)
+    const auto out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc =
+        transform_dynamic_tensor_descriptor(
+            out_n_ydot_htilda_xdot_wtilda_k_grid_desc,
+            make_tuple(make_pass_through_transform(N),
+                       make_slice_transform(YDot, I0, YDotSlice),
+                       make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice),
+                       make_slice_transform(XDot, I0, XDotSlice),
+                       make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice),
+                       make_unmerge_transform(make_tuple(K0, K1))),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3>{},
+                       Sequence<4>{},
+                       Sequence<5>{}),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3>{},
+                       Sequence<4>{},
+                       Sequence<5, 6>{}));
+
+#if 1 // active: GemmK0 is merged in (YDotSlice, XDotSlice, K0) order, matching the weight branch
+    const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#else // disabled alternative: GemmK0 merged in (K0, YDotSlice, XDotSlice) order
+    const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#endif
+
+    // input tensor: pad Hi/Wi with the left/right pads
+    const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    // embed (YTilda, HTilda) and (XTilda, WTilda) with (dilation, stride) coefficients
+    const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(YTilda, HTilda),
+                                        make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(XTilda, WTilda),
+                                        make_tuple(ConvDilationW, ConvStrideW)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    // freeze YTilda/XTilda at IYTilda/IXTilda and slice HTilda/WTilda
+    const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_freeze_transform(IYTilda),
+                   make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice),
+                   make_freeze_transform(IXTilda),
+                   make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{},
+                   Sequence<1>{},
+                   Sequence<2>{},
+                   Sequence<3>{},
+                   Sequence<4>{},
+                   Sequence<5>{}),
+        make_tuple(Sequence<0>{},
+                   Sequence<>{},
+                   Sequence<1>{},
+                   Sequence<>{},
+                   Sequence<2>{},
+                   Sequence<3>{}));
+    // GemmM = C (dim 0), GemmN = merge of (N, HTildaSlice, WTildaSlice) (dim 1)
+    const auto in_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_htildaslice_wtildaslice_c_grid_desc,
+        make_tuple(make_pass_through_transform(C),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice))),
+        make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
+                      out_gemmk0_gemmn_gemmk1_grid_desc,
+                      in_gemmm_gemmn_grid_desc);
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/kernel_algorithm/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/kernel_algorithm/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,275 @@
+#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
+#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+
+namespace ck {
+
+// A: out (first element of the returned tuple, laid out as [GemmK0, GemmM, GemmK1])
+// B: wei (second element of the returned tuple, laid out as [GemmK0, GemmN, GemmK1])
+// C: in  (third element of the returned tuple, laid out as [GemmM, GemmN])
+// Number of GEMMs = YTilda * XTilda; one call builds the descriptors for the GEMM at (IYTilda, IXTilda)
+// GemmM = N * HTildaSlice * WTildaSlice
+// GemmN = C
+// GemmK = K * YDotSlice * XDotSlice, split into GemmK0 = YDotSlice * XDotSlice * (K / GemmK1) and GemmK1
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t IYTildaValue,
+          index_t IXTildaValue,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk(
+    const DynamicTensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc, // dims 1, 2, 3 read as Ho, Wo, K
+    const DynamicTensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,   // dims 1, 2 read as Y, X
+    const DynamicTensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,   // dims 0..3 read as N, Hi, Wi, C
+    const ConvStrides& conv_strides,     // [0] = H stride, [1] = W stride
+    const ConvDilations& conv_dilations, // [0] = H dilation, [1] = W dilation
+    const InLeftPads& in_left_pads,      // [0] = H left pad, [1] = W left pad
+    const InRightPads& in_right_pads,    // [0] = H right pad, [1] = W right pad
+    Number<IYTildaValue>, // compile-time ytilda index frozen for this GEMM
+    Number<IXTildaValue>, // compile-time xtilda index frozen for this GEMM
+    Number<GemmK1Value>)  // compile-time GemmK1: K is unmerged into (K / GemmK1, GemmK1)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto GemmK1  = Number<GemmK1Value>{};
+    constexpr auto IYTilda = Number<IYTildaValue>{};
+    constexpr auto IXTilda = Number<IXTildaValue>{};
+
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
+
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+
+    const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+    const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);
+
+    const auto YTilda = ConvStrideH / GcdStrideDilationH; // stride / gcd(stride, dilation) along H
+    const auto XTilda = ConvStrideW / GcdStrideDilationW; // stride / gcd(stride, dilation) along W
+
+    const auto YDot = math::integer_divide_ceil(Y, YTilda); // Y is split as (YDot, YTilda)
+    const auto XDot = math::integer_divide_ceil(X, XTilda); // X is split as (XDot, XTilda)
+
+    const auto HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
+    const auto WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);
+
+    // restrict work to the [SliceBegin, SliceEnd) range of HTilda/WTilda that contributes to the non-padding area of the input tensor
+    const auto IHTildaSliceBegin = math::integer_divide_floor(
+        math::max(I0, InLeftPadH - ConvDilationH * (YTilda - I1)), ConvStrideH);
+    const auto IWTildaSliceBegin = math::integer_divide_floor(
+        math::max(I0, InLeftPadW - ConvDilationW * (XTilda - I1)), ConvStrideW);
+
+    const auto IHTildaSliceEnd =
+        math::min(HTilda, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
+    const auto IWTildaSliceEnd =
+        math::min(WTilda, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);
+
+    const auto HTildaSlice = IHTildaSliceEnd - IHTildaSliceBegin; // slice length along HTilda
+    const auto WTildaSlice = IWTildaSliceEnd - IWTildaSliceBegin; // slice length along WTilda
+
+    // GemmK differs per GEMM: the YDot/XDot slice lengths depend on the frozen (IYTilda, IXTilda)
+    const auto YDotSlice = math::integer_divide_ceil(Y - IYTilda, YTilda);
+    const auto XDotSlice = math::integer_divide_ceil(X - IXTilda, XTilda);
+
+    const auto K1 = GemmK1;
+    const auto K0 = K / K1; // NOTE(review): exact only if K is a multiple of GemmK1 - confirm the caller guarantees it
+
+    // A: output tensor
+    // Ho/Wo go through pad transforms with zero left/right pad; per the original note this adds a padding check
+    const auto out_n_hop_wop_k_grid_desc = transform_dynamic_tensor_descriptor(
+        out_n_ho_wo_k_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Ho, I0, I0),
+                   make_pad_transform(Wo, I0, I0),
+                   make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+    const auto out_n_ydot_htilda_xdot_wtilda_k_grid_desc = transform_dynamic_tensor_descriptor(
+        out_n_hop_wop_k_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(YDot, HTilda),
+                                        make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), // note the negative YDot coefficient
+                   make_embed_transform(make_tuple(XDot, WTilda),
+                                        make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), // note the negative XDot coefficient
+                   make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); // -> [N, YDot, HTilda, XDot, WTilda, K]
+
+    const auto out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc =
+        transform_dynamic_tensor_descriptor(
+            out_n_ydot_htilda_xdot_wtilda_k_grid_desc,
+            make_tuple(make_pass_through_transform(N),
+                       make_slice_transform(YDot, I0, YDotSlice),
+                       make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice),
+                       make_slice_transform(XDot, I0, XDotSlice),
+                       make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice),
+                       make_unmerge_transform(make_tuple(K0, K1))),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3>{},
+                       Sequence<4>{},
+                       Sequence<5>{}),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3>{},
+                       Sequence<4>{},
+                       Sequence<5, 6>{})); // K occupies two upper dims: K0 at 5, K1 at 6
+
+#if 1 // active variant: GemmK0 = merge(YDotSlice, XDotSlice, K0)
+    const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); // -> [GemmK0, GemmM, GemmK1]
+#else // disabled variant: GemmK0 = merge(K0, YDotSlice, XDotSlice)
+    const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        out_n_ydotslice_htildaslice_xdotslice_wtildaslice_k0_k1_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
+                   make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#endif
+
+    // B: weight tensor; Y and X are each split into (Dot, Tilda) pairs
+    const auto wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc = transform_dynamic_tensor_descriptor(
+        wei_k_y_x_c_grid_desc,
+        make_tuple(make_pass_through_transform(K),
+                   make_embed_transform(make_tuple(YDot, YTilda),
+                                        make_tuple(ConvStrideH / GcdStrideDilationH, I1)), // YDot coefficient equals YTilda
+                   make_embed_transform(make_tuple(XDot, XTilda),
+                                        make_tuple(ConvStrideW / GcdStrideDilationW, I1)), // XDot coefficient equals XTilda
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); // -> [K, YDot, YTilda, XDot, XTilda, C]
+
+    const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc =
+        transform_dynamic_tensor_descriptor(wei_k_ydot_ytilda_xdot_xtilda_c_grid_desc,
+                                            make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
+                                                       make_slice_transform(YDot, I0, YDotSlice),
+                                                       make_slice_transform(XDot, I0, XDotSlice),
+                                                       make_freeze_transform(IYTilda),
+                                                       make_freeze_transform(IXTilda),
+                                                       make_pass_through_transform(C)),
+                                            make_tuple(Sequence<0>{},  // K
+                                                       Sequence<1>{},  // YDot
+                                                       Sequence<3>{},  // XDot (listed before YTilda to match the transform order)
+                                                       Sequence<2>{},  // YTilda
+                                                       Sequence<4>{},  // XTilda
+                                                       Sequence<5>{}), // C
+                                            make_tuple(Sequence<0, 1>{}, // K0, K1
+                                                       Sequence<2>{},    // YDotSlice
+                                                       Sequence<3>{},    // XDotSlice
+                                                       Sequence<>{},     // frozen YTilda: no upper dimension
+                                                       Sequence<>{},     // frozen XTilda: no upper dimension
+                                                       Sequence<4>{}));  // C
+
+#if 1 // active variant: same GemmK0 merge order (YDotSlice, XDotSlice, K0) as the active A variant
+    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
+                   make_pass_through_transform(C),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); // -> [GemmK0, GemmN, GemmK1]
+#else // disabled variant: GemmK0 = merge(K0, YDotSlice, XDotSlice), pairs with the disabled A variant
+    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
+                   make_pass_through_transform(C),
+                   make_pass_through_transform(K1)),
+        make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+#endif
+
+    // C: input tensor; Hi/Wi are padded, then each is embedded as a (Tilda index, Tilda position) pair
+    const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+    const auto in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(YTilda, HTilda),
+                                        make_tuple(ConvDilationH, ConvStrideH)), // coefficients for (YTilda, HTilda)
+                   make_embed_transform(make_tuple(XTilda, WTilda),
+                                        make_tuple(ConvDilationW, ConvStrideW)), // coefficients for (XTilda, WTilda)
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); // -> [N, YTilda, HTilda, XTilda, WTilda, C]
+
+    const auto in_n_htildaslice_wtildaslice_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_ytilda_htilda_xtilda_wtilda_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_freeze_transform(IYTilda),
+                   make_slice_transform(HTilda, IHTildaSliceBegin, HTildaSlice),
+                   make_freeze_transform(IXTilda),
+                   make_slice_transform(WTilda, IWTildaSliceBegin, WTildaSlice),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{},
+                   Sequence<1>{},
+                   Sequence<2>{},
+                   Sequence<3>{},
+                   Sequence<4>{},
+                   Sequence<5>{}),
+        make_tuple(Sequence<0>{},  // N
+                   Sequence<>{},   // frozen YTilda: no upper dimension
+                   Sequence<1>{},  // HTildaSlice
+                   Sequence<>{},   // frozen XTilda: no upper dimension
+                   Sequence<2>{},  // WTildaSlice
+                   Sequence<3>{})); // C
+
+    const auto in_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_htildaslice_wtildaslice_c_grid_desc,
+        make_tuple(make_merge_transform(make_tuple(N, HTildaSlice, WTildaSlice)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{})); // -> [GemmM, GemmN]
+
+    return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, // A
+                      wei_gemmk0_gemmn_gemmk1_grid_desc, // B
+                      in_gemmm_gemmn_grid_desc);         // C
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp
@@ -18,9 +18,9 @@ template <typename... Wei,
          typename InLeftPads,
          typename InRightPads>
 __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_pad(
-    const DynamicTensorDescriptor<Wei...>& wei_k_y_x_c_global_desc,
-    const DynamicTensorDescriptor<In...>& in_n_hi_wi_c_global_desc,
-    const DynamicTensorDescriptor<Out...>& out_n_ho_wo_k_global_desc,
+    const DynamicTensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const DynamicTensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const DynamicTensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
@@ -31,18 +31,18 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

-    const auto N = in_n_hi_wi_c_global_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_global_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_global_desc.GetLength(I3);
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);

-    const auto Hi = in_n_hi_wi_c_global_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_global_desc.GetLength(I2);
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);

-    const auto Ho = out_n_ho_wo_k_global_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_global_desc.GetLength(I2);
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);

-    const auto Y = wei_k_y_x_c_global_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_global_desc.GetLength(I2);
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);

    const auto ConvStrideH = conv_strides[I0];
    const auto ConvStrideW = conv_strides[I1];
@@ -57,15 +57,15 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_
    const auto InRightPadW = in_right_pads[I1];

    // weight tensor
-    const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor(
+    const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor(
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)),
        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<1>{}, Sequence<0>{}));

    // input tensor
-    const auto in_n_hip_wip_c_global_desc = transform_dynamic_tensor_descriptor(
-        in_n_hi_wi_c_global_desc,
+    const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
        make_tuple(make_pass_through_transform(N),
                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
@@ -73,8 +73,8 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

-    const auto in_n_y_ho_x_wo_c_global_desc = transform_dynamic_tensor_descriptor(
-        in_n_hip_wip_c_global_desc,
+    const auto in_n_y_ho_x_wo_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
        make_tuple(make_pass_through_transform(N),
                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
@@ -82,22 +82,22 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));

-    const auto in_gemmk_gemmn_global_desc =
-        transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_global_desc,
+    const auto in_gemmk_gemmn_grid_desc =
+        transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
                                            make_tuple(make_merge_transform(make_tuple(Y, X, C)),
                                                       make_merge_transform(make_tuple(N, Ho, Wo))),
                                            make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));

    // output tensor
-    const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor(
+    const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor(
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)),
        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<1>{}, Sequence<0>{}));

    return make_tuple(
-        wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc);
+        wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc);
 }

 template <typename... Wei,
@@ -108,9 +108,9 @@ template <typename... Wei,
          typename InLeftPads,
          typename InRightPads>
 __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_1x1(
-    const DynamicTensorDescriptor<Wei...>& wei_k_y_x_c_global_desc,
-    const DynamicTensorDescriptor<In...>& in_n_hi_wi_c_global_desc,
-    const DynamicTensorDescriptor<Out...>& out_n_ho_wo_k_global_desc,
+    const DynamicTensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const DynamicTensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const DynamicTensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
@@ -121,18 +121,18 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

-    const auto N = in_n_hi_wi_c_global_desc.GetLength(I0);
-    const auto C = in_n_hi_wi_c_global_desc.GetLength(I3);
-    const auto K = out_n_ho_wo_k_global_desc.GetLength(I3);
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);

-    const auto Hi = in_n_hi_wi_c_global_desc.GetLength(I1);
-    const auto Wi = in_n_hi_wi_c_global_desc.GetLength(I2);
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);

-    const auto Ho = out_n_ho_wo_k_global_desc.GetLength(I1);
-    const auto Wo = out_n_ho_wo_k_global_desc.GetLength(I2);
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);

-    const auto Y = wei_k_y_x_c_global_desc.GetLength(I1);
-    const auto X = wei_k_y_x_c_global_desc.GetLength(I2);
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);

    const auto ConvStrideH = conv_strides[I0];
    const auto ConvStrideW = conv_strides[I1];
@@ -151,28 +151,28 @@ __host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_
           InRightPadW == 0);

    // weight tensor
-    const auto wei_gemmk_gemmm_global_desc = transform_dynamic_tensor_descriptor(
+    const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor(
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C)),
        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<1>{}, Sequence<0>{}));

    // input tensor
-    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
+    const auto in_gemmk_gemmn_grid_desc = transform_dynamic_tensor_descriptor(
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, C)),
        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(C)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<1>{}, Sequence<0>{}));

    // output tensor
-    const auto out_gemmm_gemmn_global_desc = transform_dynamic_tensor_descriptor(
+    const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor(
        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)),
        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<1>{}, Sequence<0>{}));

    return make_tuple(
-        wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc);
+        wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc);
 }

 } // namespace ck
--- a/composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,129 @@
+#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
+#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+
+namespace ck {
+
+// GemmM = K                 (returns a tuple of wei [GemmK0, GemmM, GemmK1], in [GemmK0, GemmN, GemmK1], out [GemmM, GemmN])
+// GemmN = N * Ho * Wo
+// GemmK = C * Y * X, unmerged into (GemmK0 = GemmK / GemmK1, GemmK1)
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
+    const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_grid_desc,   // only lengths of dims 2, 3 (Y, X) are read
+    const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_grid_desc,   // lengths read as N, C, Hi, Wi; also the base of the input transform chain
+    const DynamicTensorDescriptor<Out...>& out_n_k_ho_wo_grid_desc, // only lengths of dims 1, 2, 3 (K, Ho, Wo) are read
+    const ConvStrides& conv_strides,     // [0] = H stride, [1] = W stride
+    const ConvDilations& conv_dilations, // [0] = H dilation, [1] = W dilation
+    const InLeftPads& in_left_pads,      // [0] = H left pad, [1] = W left pad
+    const InRightPads& in_right_pads,    // [0] = H right pad, [1] = W right pad
+    Number<GemmK1Value>)                 // compile-time GemmK1
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto GemmK1 = Number<GemmK1Value>{};
+
+    const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0);
+    const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1);
+    const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1);
+
+    const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2);
+    const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3);
+
+    const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2);
+    const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3);
+
+    const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2);
+    const auto X = wei_k_c_y_x_grid_desc.GetLength(I3);
+
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+
+    const auto GemmM  = K;
+    const auto GemmN  = N * Ho * Wo;
+    const auto GemmK  = C * Y * X;
+    const auto GemmK0 = GemmK / GemmK1; // NOTE(review): exact only if GemmK is a multiple of GemmK1 - confirm the caller guarantees it
+
+    // weight tensor: rebuilt from lengths as a packed (K, C*Y*X) descriptor; strides of wei_k_c_y_x_grid_desc are not consulted
+    const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C * Y * X)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{})); // swaps the dims: [K, C*Y*X] -> [GemmK, GemmM]
+
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        wei_gemmk_gemmm_grid_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                   make_pass_through_transform(GemmM)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // -> [GemmK0, GemmM, GemmK1]
+
+    // input tensor: pad Hi/Wi, embed each into (filter index, output index), then merge into GemmK/GemmN
+    const auto in_n_c_hip_wip_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_c_hi_wi_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+    const auto in_n_c_y_ho_x_wo_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_c_hip_wip_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pass_through_transform(C),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); // -> [N, C, Y, Ho, X, Wo]
+
+    const auto in_gemmk_gemmn_grid_desc =
+        transform_dynamic_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc,
+                                            make_tuple(make_merge_transform(make_tuple(C, Y, X)),
+                                                       make_merge_transform(make_tuple(N, Ho, Wo))),
+                                            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{})); // -> [GemmK, GemmN]
+
+    const auto in_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        in_gemmk_gemmn_grid_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                   make_pass_through_transform(GemmN)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // -> [GemmK0, GemmN, GemmK1]
+
+    // output tensor: rebuilt from lengths as a packed (N, K, Ho*Wo) descriptor; strides of out_n_k_ho_wo_grid_desc are not consulted
+    const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho * Wo)),
+        make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))),
+        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{})); // -> [GemmM = K, GemmN = merge(N, Ho*Wo)]
+
+    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
+                      in_gemmk0_gemmn_gemmk1_grid_desc,
+                      out_gemmm_gemmn_grid_desc);
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,129 @@
+#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP
+#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+
+namespace ck {
+
+// GemmM = K                 (returns a tuple of wei [GemmK0, GemmM, GemmK1], in [GemmK0, GemmN, GemmK1], out [GemmM, GemmN])
+// GemmN = N * Ho * Wo
+// GemmK = C * Y * X elements, merged in (Y, X, C) order and unmerged into (GemmK0 = GemmK / GemmK1, GemmK1)
+template <typename... Wei,
+          typename... In,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad(
+    const DynamicTensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,   // only lengths of dims 1, 2 (Y, X) are read
+    const DynamicTensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,   // lengths read as N, Hi, Wi, C; also the base of the input transform chain
+    const DynamicTensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc, // only lengths of dims 1, 2, 3 (Ho, Wo, K) are read
+    const ConvStrides& conv_strides,     // [0] = H stride, [1] = W stride
+    const ConvDilations& conv_dilations, // [0] = H dilation, [1] = W dilation
+    const InLeftPads& in_left_pads,      // [0] = H left pad, [1] = W left pad
+    const InRightPads& in_right_pads,    // [0] = H right pad, [1] = W right pad
+    Number<GemmK1Value>)                 // compile-time GemmK1
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto GemmK1 = Number<GemmK1Value>{};
+
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
+
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+
+    const auto GemmM  = K;
+    const auto GemmN  = N * Ho * Wo;
+    const auto GemmK  = C * Y * X;
+    const auto GemmK0 = GemmK / GemmK1; // NOTE(review): exact only if GemmK is a multiple of GemmK1 - confirm the caller guarantees it
+
+    // weight tensor: rebuilt from lengths as a packed (K, Y*X*C) descriptor; strides of wei_k_y_x_c_grid_desc are not consulted
+    const auto wei_gemmk_gemmm_grid_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{})); // swaps the dims: [K, Y*X*C] -> [GemmK, GemmM]
+
+    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        wei_gemmk_gemmm_grid_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                   make_pass_through_transform(GemmM)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // -> [GemmK0, GemmM, GemmK1]
+
+    // input tensor: pad Hi/Wi, embed each into (filter index, output index), then merge into GemmK/GemmN
+    const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+    const auto in_n_y_ho_x_wo_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); // -> [N, Y, Ho, X, Wo, C]
+
+    const auto in_gemmk_gemmn_grid_desc =
+        transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
+                                            make_tuple(make_merge_transform(make_tuple(Y, X, C)),
+                                                       make_merge_transform(make_tuple(N, Ho, Wo))),
+                                            make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{})); // -> [GemmK, GemmN]
+
+    const auto in_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        in_gemmk_gemmn_grid_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                   make_pass_through_transform(GemmN)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{})); // -> [GemmK0, GemmN, GemmK1]
+
+    // output tensor: rebuilt from lengths as a packed (N*Ho*Wo, K) descriptor; strides of out_n_ho_wo_k_grid_desc are not consulted
+    const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)),
+        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{})); // swaps the dims: [N*Ho*Wo, K] -> [GemmM = K, GemmN]
+
+    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
+                      in_gemmk0_gemmn_gemmk1_grid_desc,
+                      out_gemmm_gemmn_grid_desc);
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
+++ b/composable_kernel/include/kernel_algorithm/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp
@@ -0,0 +1,132 @@
+#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
+#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+
+namespace ck {
+
+// Map an NHWC input / KYXC weight / NHWK output forward convolution (with padding) to a GEMM.
+// A: in  -> [GemmK0, GemmM, GemmK1]
+// B: wei -> [GemmK0, GemmN, GemmK1]
+// C: out -> [GemmM, GemmN]
+// GemmM = N * Ho * Wo, GemmN = K
+// GemmK = Y * X * C (merged in that order), split as GemmK0 = GemmK / GemmK1 and GemmK1
+template <typename... In,
+          typename... Wei,
+          typename... Out,
+          typename ConvStrides,
+          typename ConvDilations,
+          typename InLeftPads,
+          typename InRightPads,
+          index_t GemmK1Value>
+__host__ __device__ constexpr auto
+transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad(
+    const DynamicTensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
+    const DynamicTensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
+    const DynamicTensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
+    const ConvStrides& conv_strides,
+    const ConvDilations& conv_dilations,
+    const InLeftPads& in_left_pads,
+    const InRightPads& in_right_pads,
+    Number<GemmK1Value>)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto GemmK1 = Number<GemmK1Value>{};
+    // extents are read from the input, output and weight descriptors
+    const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0);
+    const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3);
+    const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3);
+
+    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
+    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
+
+    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
+    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
+
+    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
+    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);
+    // index 0 of each convolution parameter is the H component, index 1 the W component
+    const auto ConvStrideH = conv_strides[I0];
+    const auto ConvStrideW = conv_strides[I1];
+
+    const auto ConvDilationH = conv_dilations[I0];
+    const auto ConvDilationW = conv_dilations[I1];
+
+    const auto InLeftPadH = in_left_pads[I0];
+    const auto InLeftPadW = in_left_pads[I1];
+
+    const auto InRightPadH = in_right_pads[I0];
+    const auto InRightPadW = in_right_pads[I1];
+    // GEMM sizes; GemmK / GemmK1 is an integer division (presumably GemmK % GemmK1 == 0 - confirm)
+    const auto GemmM  = N * Ho * Wo;
+    const auto GemmN  = K;
+    const auto GemmK  = Y * X * C;
+    const auto GemmK0 = GemmK / GemmK1;
+
+    // A: input tensor, step 1: pad Hi and Wi with the left/right pads
+    const auto in_n_hip_wip_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hi_wi_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
+                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+    // step 2: embed (Y, Ho) into padded H and (X, Wo) into padded W with (dilation, stride)
+    const auto in_n_y_ho_x_wo_c_grid_desc = transform_dynamic_tensor_descriptor(
+        in_n_hip_wip_c_grid_desc,
+        make_tuple(make_pass_through_transform(N),
+                   make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
+                   make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                   make_pass_through_transform(C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
+    // step 3: merge (Y, X, C) into GemmK (dim 0) and (N, Ho, Wo) into GemmM (dim 1)
+    const auto in_gemmk_gemmm_grid_desc =
+        transform_dynamic_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
+                                            make_tuple(make_merge_transform(make_tuple(Y, X, C)),
+                                                       make_merge_transform(make_tuple(N, Ho, Wo))),
+                                            make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+    // step 4: unmerge GemmK into GemmK0 (dim 0) and GemmK1 (dim 2); GemmM stays at dim 1
+    const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        in_gemmk_gemmm_grid_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                   make_pass_through_transform(GemmM)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+    // B: weight tensor, rebuilt as a packed [K, Y * X * C] view and transposed to [GemmK, GemmN]
+    const auto wei_gemmk_gemmn_grid_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, Y * X * C)),
+        make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<1>{}, Sequence<0>{}));
+    // same GemmK -> (GemmK0, GemmK1) split as for A
+    const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_dynamic_tensor_descriptor(
+        wei_gemmk_gemmn_grid_desc,
+        make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)),
+                   make_pass_through_transform(GemmN)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+    // C: output tensor, rebuilt as a packed [N * Ho * Wo, K] view, i.e. [GemmM, GemmN] as-is
+    const auto out_gemmm_gemmn_grid_desc = transform_dynamic_tensor_descriptor(
+        make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N * Ho * Wo, K)),
+        make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)),
+        make_tuple(Sequence<0>{}, Sequence<1>{}),
+        make_tuple(Sequence<0>{}, Sequence<1>{}));
+    // returned in A, B, C order
+    return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc,
+                      wei_gemmk0_gemmn_gemmk1_grid_desc,
+                      out_gemmm_gemmn_grid_desc);
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
@@ -1417,6 +1417,7 @@ struct DynamicUnMerge
        printf("DynamicUnMerge, ");
        printf("up_lengths_");
        print_multi_index(up_lengths_);
+        printf("up_lengths_scan_");
        print_multi_index(up_lengths_scan_);
        printf("}");
    }
@@ -1439,12 +1440,12 @@ struct DynamicFreeze

    template <typename LowIdx, typename UpIdx>
    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
-                                                           const UpIdx& idx_up) const
+                                                           const UpIdx& /* idx_up */) const
    {
        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 0,
                      "wrong! inconsistent # of dimension");

-        idx_low = low_idx_;
+        idx_low(Number<0>{}) = low_idx_;
    }

    template <typename LowIdxDiff,
@@ -1453,9 +1454,9 @@ struct DynamicFreeze
              typename UpIdx,
              index_t Hack>
    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
-                                                     const UpIdxDiff& idx_diff_up,
-                                                     LowIdx& idx_low,
-                                                     const UpIdx& idx_up_new,
+                                                     const UpIdxDiff& /* idx_diff_up */,
+                                                     LowIdx& /* idx_low */,
+                                                     const UpIdx& /* idx_up_new */,
                                                     Number<Hack>)
    {
        idx_diff_low(Number<0>{}) = 0;
@@ -1487,6 +1488,73 @@ struct DynamicFreeze
    }
 };

+// Insert a dangling upper dimension of length UpperLength that maps to no lower dimension
+template <typename UpperLength>
+struct DynamicInsert
+{
+    using UpLengths = decltype(make_tuple(UpperLength{}));
+    // length of the single inserted upper dimension, stored as a one-element tuple
+    UpLengths up_lengths_;
+
+    __host__ __device__ constexpr DynamicInsert() = default;
+
+    __host__ __device__ constexpr DynamicInsert(const UpperLength& up_length)
+        : up_lengths_{make_tuple(up_length)}
+    {
+    }
+
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 0; }
+
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+
+    __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }
+    // there is no lower index to write, so only the index ranks are checked
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx&, const UpIdx&) const
+    {
+        static_assert(LowIdx::Size() == 0 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+    }
+    // likewise a no-op apart from the rank checks: no argument is read or modified
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ static void
+    UpdateLowerIndex(LowIdxDiff&, const UpIdxDiff&, LowIdx&, const UpIdx&, Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 0 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 0 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+    }
+
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    // every upper index is reported as valid
+    template <typename UpIdx>
+    __host__ __device__ static constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
+    {
+        return true;
+    }
+    // compile-time known exactly when the upper length type is
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpperLength>::value;
+    }
+    // prints the transform name immediately followed by up_lengths_ (no surrounding braces)
+    __host__ __device__ void Print() const
+    {
+        printf("DynamicInsert");
+        print_multi_index(up_lengths_);
+    }
+};
+
 template <typename VectorSize, typename UpLength>
 struct DynamicVectorize
 {
@@ -1572,5 +1640,99 @@ struct DynamicVectorize
    }
 };

+// Expose the index range [slice_begin, slice_end) of one lower dimension as a single upper
+// dimension of length slice_end - slice_begin (idx_low = idx_up + slice_begin).
+template <typename LowLength, typename SliceBegin, typename SliceEnd>
+struct DynamicSlice
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+
+    using UpLengths = decltype(make_tuple(SliceEnd{} - SliceBegin{}));
+
+    UpLengths up_lengths_;
+    SliceBegin slice_begin_;
+    SliceEnd slice_end_;
+
+    __host__ __device__ constexpr DynamicSlice() = default;
+    // the lower length is neither stored nor validated against the slice bounds
+    __host__ __device__ constexpr DynamicSlice(const LowLength& /* low_length */,
+                                               const SliceBegin& slice_begin,
+                                               const SliceEnd& slice_end)
+        : up_lengths_{make_tuple(slice_end - slice_begin)},
+          slice_begin_{slice_begin},
+          slice_end_{slice_end}
+    {
+    }
+
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+    // lower index = upper index shifted by the start of the slice
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        idx_low(Number<0>{}) = idx_up[Number<0>{}] + slice_begin_;
+    }
+    // a step in the upper index is the same step in the lower index (the offset cancels out)
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& /* idx_up_new */,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        constexpr auto I0 = Number<0>{};
+
+        idx_diff_low(I0) = idx_diff_up[I0];
+
+        idx_low += idx_diff_low;
+    }
+
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return true;
+    }
+    // every upper index is reported as valid; the argument is not inspected
+    template <typename UpIdx>
+    __host__ __device__ constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) const
+    {
+        return true;
+    }
+    // compile-time known only if the length and both slice bounds are
+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        return is_known_at_compile_time<UpLengths>::value &&
+               is_known_at_compile_time<SliceBegin>::value &&
+               is_known_at_compile_time<SliceEnd>::value;
+    }
+    // labels match the member names; the two slice bounds are separated by ", "
+    __host__ __device__ void Print() const
+    {
+        printf("{DynamicSlice, ");
+        printf("up_lengths_");
+        print_multi_index(up_lengths_);
+        printf("slice_begin_ %d, slice_end_ %d", index_t{slice_begin_}, index_t{slice_end_});
+        printf("}");
+    }
+};
+
 } // namespace ck
 #endif
--- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp
@@ -85,6 +85,14 @@ __host__ __device__ constexpr auto make_freeze_transform(const LowerIndex& low_i
    return DynamicFreeze<LowerIndex>{low_idx};
 }

+template <typename LowLength, typename SliceBegin, typename SliceEnd>
+__host__ __device__ constexpr auto make_slice_transform(const LowLength& low_length,
+                                                        const SliceBegin& slice_begin,
+                                                        const SliceEnd& slice_end)
+{ // factory: deduces the three types and forwards the values to DynamicSlice's constructor
+    return DynamicSlice<LowLength, SliceBegin, SliceEnd>{low_length, slice_begin, slice_end};
+}
+
 template <typename VectorSize, typename UpLength>
 __host__ __device__ constexpr auto make_vectorize_transform(const VectorSize& vector_size,
                                                            const UpLength& up_length)
--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
@@ -137,7 +137,7 @@ make_dynamic_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths
                                        math::multiplies_v2{},
                                        Number<stride_n_minus_2>{},
                                        i + I1,
-                                        Number<N - 2>{},
+                                        Number<N - 1>{},
                                        I1);
            }
        },
--- a/composable_kernel/include/tensor_description/multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp
@@ -121,7 +121,7 @@ struct Slice
                          SliceEnds::GetSize() == nDim,
                      "wrong! # of dimensions not consistent");

-#if 0 
+#if 0
        // TODO: would not compile, error on constexpr
        static_for<0, nDim, 1>{}([&](auto idim) {
            static_assert(SliceBegins::At(idim) <= SliceEnds::At(idim) &&
--- a/composable_kernel/include/tensor_description/tensor_adaptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_adaptor.hpp
@@ -184,6 +184,18 @@ struct TensorAdaptor
        return get_container_subset(idx_hidden, BottomDimensionHiddenIds{});
    }

+    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
+    {
+        bool is_known = true;
+        // AND together IsKnownAtCompileTime() of every transform type held in Transforms
+        static_for<0, Transforms::Size(), 1>{}([&](auto i) {
+            is_known &=
+                remove_cv_t<remove_reference_t<decltype(Transforms{}[i])>>::IsKnownAtCompileTime();
+        });
+        // ElementSize must be compile-time known as well
+        return is_known && is_known_at_compile_time<ElementSize>::value;
+    }
+
    __host__ __device__ void Print() const
    {
        printf("{");
--- a/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
@@ -0,0 +1,528 @@
+#ifndef CK_BLOCKWISE_GEMM_XDLOPS_HPP
+#define CK_BLOCKWISE_GEMM_XDLOPS_HPP
+
+#include "common_header.hpp"
+#include "threadwise_dynamic_tensor_slice_transfer.hpp"
+#include "xdlops_gemm.hpp"
+
+namespace ck {
+
+// Blockwise GEMM built on XdlopsGemm: for every K step of KPerXdlops each thread copies its A/B
+// slices from the block buffers into thread buffers and runs xdlops_gemm on every (m0, n0) repeat.
+template <index_t BlockSize,
+          typename FloatAB,
+          class ABlockDesc,
+          class BBlockDesc,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t KPack>
+struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1
+{
+    using CIndex = MultiIndex<2>;
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    static constexpr index_t WaveSize = 64;
+    // block descriptors are indexed as [K, M0 or N0, M1 or N1, KPack]
+    static constexpr index_t M0 = ABlockDesc{}.GetLength(I1);
+    static constexpr index_t M1 = ABlockDesc{}.GetLength(I2);
+
+    static constexpr index_t N0 = BBlockDesc{}.GetLength(I1);
+    static constexpr index_t N1 = BBlockDesc{}.GetLength(I2);
+
+    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerWave, NPerWave, KPack>{};
+    // waves along M1 / N1; the constructor asserts BlockSize == MWaves * NWaves * WaveSize
+    static constexpr index_t MWaves = M1 / MPerWave;
+    static constexpr index_t NWaves = N1 / NPerWave;
+
+    static constexpr index_t MRepeat = M0;
+    static constexpr index_t NRepeat = N0;
+
+    __device__ constexpr auto GetCLayout() const { return xdlops_gemm.GetCLayout(); }
+
+    __device__ constexpr auto GetNumBlks() const { return xdlops_gemm.GetCLayout().GetNumBlks(); }
+
+    __device__ constexpr auto GetBlkSize() const { return xdlops_gemm.GetCLayout().GetBlkSize(); }
+    // origin of this thread's A reads, returned as (k_offset, 0, m_offset, 0)
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const index_t thread_id = get_thread_local_1d_id();
+        const index_t waveId    = thread_id / WaveSize;
+        const index_t laneId    = thread_id % WaveSize;
+        const index_t waveId_m  = waveId / NWaves;
+        // the wave's N coordinate does not enter the A origin, so it is not computed here
+
+        if constexpr(xdlops_gemm.IsKReduction)
+        {
+            const index_t m_offset = waveId_m * MPerWave + xdlops_gemm.GetBlkTd(laneId);
+            const index_t k_offset = xdlops_gemm.GetBlkId(laneId);
+            return make_tuple(k_offset, 0, m_offset, 0);
+        }
+        else
+        {
+            const index_t m_offset = waveId_m * MPerWave + laneId;
+            const index_t k_offset = 0;
+            return make_tuple(k_offset, 0, m_offset, 0);
+        }
+    }
+    // origin of this thread's B reads, returned as (k_offset, 0, n_offset, 0)
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const index_t thread_id = get_thread_local_1d_id();
+        const index_t waveId    = thread_id / WaveSize;
+        const index_t laneId    = thread_id % WaveSize;
+        // the wave's M coordinate does not enter the B origin, so it is not computed here
+        const index_t waveId_n  = waveId % NWaves;
+
+        if constexpr(xdlops_gemm.IsKReduction)
+        {
+            const index_t n_offset = waveId_n * NPerWave + xdlops_gemm.GetBlkTd(laneId);
+            const index_t k_offset = xdlops_gemm.GetBlkId(laneId);
+            return make_tuple(k_offset, 0, n_offset, 0);
+        }
+        else
+        {
+            const index_t n_offset = waveId_n * NPerWave + laneId;
+            const index_t k_offset = 0;
+            return make_tuple(k_offset, 0, n_offset, 0);
+        }
+    }
+    // (m, n) origin of this thread for repeat (m0, n0) and xdlops output block (xdlops_i, blk_i)
+    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
+    __device__ static CIndex
+        CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
+    {
+        const index_t waveId = get_thread_local_1d_id() / WaveSize;
+
+        const auto thread_mtx_on_blk = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i);
+
+        const index_t waveId_m = waveId / NWaves;
+        const index_t waveId_n = waveId % NWaves;
+
+        const index_t m_offset = m0 * M1 + waveId_m * MPerWave + thread_mtx_on_blk[I0];
+        const index_t n_offset = n0 * N1 + waveId_n * NPerWave + thread_mtx_on_blk[I1];
+
+        return CIndex{m_offset, n_offset};
+    }
+    // sets the per-thread copy origins and checks the template configuration at compile time
+    __device__ BlockwiseGemmXdlops_km_kn_m0m1m2n_v1()
+        : a_thread_copy_{CalculateAThreadOriginDataIndex()},
+          b_thread_copy_{CalculateBThreadOriginDataIndex()}
+    {
+        static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+
+        static_assert(ABlockDesc{}.GetLength(I0) == BBlockDesc{}.GetLength(I0),
+                      "wrong! K dimension not consistent");
+
+        static_assert(ABlockDesc{}.GetLength(I3) == BBlockDesc{}.GetLength(I3),
+                      "wrong! KPack dimension not consistent");
+
+        static_assert(BlockSize == MWaves * NWaves * WaveSize,
+                      "BlockSize != MWaves * NWaves * WaveSize\n");
+
+        static_assert(KPack == BBlockDesc{}.GetLength(I3), "KPack is wrong!");
+
+        constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0);
+
+        static_assert(KPerBlock % xdlops_gemm.KPerXdlops == 0, "KPerBlock is wrong!");
+
+        static_assert(KPack % xdlops_gemm.mfma_type.k_base == 0, "KPack is wrong!");
+    }
+    // a_block_buf / b_block_buf are read; c_thread_buf is handed to xdlops_gemm.Run for updating
+    template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
+    __device__ void Run(const ABlockBuffer& a_block_buf,
+                        const BBlockBuffer& b_block_buf,
+                        CThreadBuffer& c_thread_buf) const
+    {
+        auto a_thread_buf =
+            make_static_buffer<AddressSpace::Vgpr, FloatAB>(a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf =
+            make_static_buffer<AddressSpace::Vgpr, FloatAB>(b_thread_desc_.GetElementSpaceSize());
+
+        constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0);
+
+        vector_type<FloatAB, a_thread_desc_.GetElementSpaceSize()> a_thread_vec;
+
+        vector_type<FloatAB, b_thread_desc_.GetElementSpaceSize()> b_thread_vec;
+        // one iteration per K step of KPerXdlops
+        static_for<0, KPerBlock, xdlops_gemm.KPerXdlops>{}([&](auto k) {
+            // read A
+            a_thread_copy_.Run(ABlockDesc{},
+                               make_tuple(k, I0, I0, I0),
+                               a_block_buf,
+                               a_thread_desc_,
+                               make_tuple(I0, I0, I0, I0),
+                               a_thread_buf);
+
+            // read B
+            b_thread_copy_.Run(BBlockDesc{},
+                               make_tuple(k, I0, I0, I0),
+                               b_block_buf,
+                               b_thread_desc_,
+                               make_tuple(I0, I0, I0, I0),
+                               b_thread_buf);
+            // the thread buffers are repacked so they can be viewed as k_base-wide vectors
+            using mfma_input_type =
+                typename vector_type<FloatAB, xdlops_gemm.mfma_type.k_base>::type;
+
+            static_for<0, a_thread_desc_.GetElementSpaceSize(), 1>{}([&](auto i) {
+                a_thread_vec.template AsType<FloatAB>()(Number<i>{}) = a_thread_buf[Number<i>{}];
+            });
+
+            static_for<0, b_thread_desc_.GetElementSpaceSize(), 1>{}([&](auto i) {
+                b_thread_vec.template AsType<FloatAB>()(Number<i>{}) = b_thread_buf[Number<i>{}];
+            });
+            // run xdlops_gemm once per (m0, n0) repeat pair
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    xdlops_gemm.template Run<decltype(a_thread_desc_),
+                                             decltype(b_thread_desc_),
+                                             decltype(c_thread_desc_),
+                                             m0,
+                                             n0>(a_thread_vec.template AsType<mfma_input_type>(),
+                                                 b_thread_vec.template AsType<mfma_input_type>(),
+                                                 c_thread_buf);
+                });
+            });
+        });
+    }
+
+    private:
+    // A[K, M]
+    static constexpr auto a_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2(
+        make_tuple(I1, Number<MRepeat>{}, I1, Number<KPack>{}));
+
+    // B[K, N]
+    static constexpr auto b_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2(
+        make_tuple(I1, Number<NRepeat>{}, I1, Number<KPack>{}));
+    // C is described per thread as [MRepeat, NRepeat]
+    static constexpr auto c_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2(
+        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}));
+    // per-thread copies of a [1, MRepeat | NRepeat, 1, KPack] slice from the block descriptors
+    using AThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4<FloatAB,
+                                                                FloatAB,
+                                                                ABlockDesc,
+                                                                decltype(a_thread_desc_),
+                                                                Sequence<1, MRepeat, 1, KPack>,
+                                                                Sequence<0, 1, 2, 3>,
+                                                                3,
+                                                                KPack,
+                                                                1>;
+
+    using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4<FloatAB,
+                                                                FloatAB,
+                                                                BBlockDesc,
+                                                                decltype(b_thread_desc_),
+                                                                Sequence<1, NRepeat, 1, KPack>,
+                                                                Sequence<0, 1, 2, 3>,
+                                                                3,
+                                                                KPack,
+                                                                1>;
+
+    AThreadCopy a_thread_copy_;
+    BThreadCopy b_thread_copy_;
+};
+
+template <index_t BlockSize,
+          typename FloatAB,
+          class ABlockDesc,
+          class BBlockDesc,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t KPack>
+struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline
+{
+
+    using CIndex = MultiIndex<2>;
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    static constexpr auto xdlops_gemm = XdlopsGemm<float, MPerWave, NPerWave, KPack>{};
+
+    static constexpr index_t WaveSize = 64;
+
+    static constexpr index_t M0 = ABlockDesc{}.GetLength(I1);
+    static constexpr index_t M1 = ABlockDesc{}.GetLength(I2);
+
+    static constexpr index_t N0 = BBlockDesc{}.GetLength(I1);
+    static constexpr index_t N1 = BBlockDesc{}.GetLength(I2);
+
+    static constexpr index_t MWaves = M1 / MPerWave;
+    static constexpr index_t NWaves = N1 / NPerWave;
+
+    static constexpr index_t MRepeat = M0;
+    static constexpr index_t NRepeat = N0;
+
+    __device__ constexpr auto GetCLayout() const { return xdlops_gemm.GetCLayout(); }
+
+    __device__ constexpr auto GetNumBlks() const { return xdlops_gemm.GetCLayout().GetNumBlks(); }
+
+    __device__ constexpr auto GetBlkSize() const { return xdlops_gemm.GetCLayout().GetBlkSize(); }
+
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const index_t thread_id = get_thread_local_1d_id();
+        const index_t waveId    = thread_id / WaveSize;
+        const index_t laneId    = thread_id % WaveSize;
+        const index_t waveId_m  = waveId / NWaves;
+        const index_t waveId_n  = waveId % NWaves;
+
+        if constexpr(xdlops_gemm.IsKReduction)
+        {
+            const index_t m_offset = waveId_m * MPerWave + xdlops_gemm.GetBlkTd(laneId);
+            const index_t k_offset = xdlops_gemm.GetBlkId(laneId);
+            return make_tuple(k_offset, 0, m_offset, 0);
+        }
+        else
+        {
+            const index_t m_offset = waveId_m * MPerWave + laneId;
+            const index_t k_offset = 0;
+            return make_tuple(k_offset, 0, m_offset, 0);
+        }
+    }
+
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const index_t thread_id = get_thread_local_1d_id();
+        const index_t waveId    = thread_id / WaveSize;
+        const index_t laneId    = thread_id % WaveSize;
+        const index_t waveId_m  = waveId / NWaves;
+        const index_t waveId_n  = waveId % NWaves;
+
+        if constexpr(xdlops_gemm.IsKReduction)
+        {
+            const index_t n_offset = waveId_n * NPerWave + xdlops_gemm.GetBlkTd(laneId);
+            const index_t k_offset = xdlops_gemm.GetBlkId(laneId);
+            return make_tuple(k_offset, 0, n_offset, 0);
+        }
+        else
+        {
+            const index_t n_offset = waveId_n * NPerWave + laneId;
+            const index_t k_offset = 0;
+            return make_tuple(k_offset, 0, n_offset, 0);
+        }
+    }
+
+    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
+    __device__ static CIndex
+        CalculateCThreadOriginDataIndex(Number<m0>, Number<n0>, Number<xdlops_i>, Number<blk_i>)
+    {
+
+        const index_t waveId = get_thread_local_1d_id() / WaveSize;
+
+        const auto thread_mtx_on_blk = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i);
+
+        const index_t waveId_m = waveId / NWaves;
+        const index_t waveId_n = waveId % NWaves;
+
+        const index_t m_offset = m0 * M1 + waveId_m * MPerWave + thread_mtx_on_blk[I0];
+        const index_t n_offset = n0 * N1 + waveId_n * NPerWave + thread_mtx_on_blk[I1];
+
+        return CIndex{m_offset, n_offset};
+    }
+
+    __device__ BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline()
+        : a_thread_copy_{CalculateAThreadOriginDataIndex()},
+          b_thread_copy_{CalculateBThreadOriginDataIndex()}
+    {
+        static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(),
+                      "wrong! Desc should be known at compile-time");
+
+        static_assert(ABlockDesc{}.GetLength(I0) == BBlockDesc{}.GetLength(I0),
+                      "wrong! K dimension not consistent");
+
+        static_assert(ABlockDesc{}.GetLength(I3) == BBlockDesc{}.GetLength(I3),
+                      "wrong! KPack dimension not consistent");
+
+        static_assert(BlockSize == MWaves * NWaves * WaveSize,
+                      "BlockSize != MWaves * NWaves * WaveSize\n");
+
+        static_assert(KPack == BBlockDesc{}.GetLength(I3), "KPack is wrong!");
+
+        constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0);
+
+        static_assert(KPerBlock % xdlops_gemm.KPerXdlops == 0, "KPerBlock is wrong!");
+
+        static_assert(KPack % xdlops_gemm.mfma_type.k_base == 0, "KPack is wrong!");
+    }
+
+    // Accumulates the block-level product of a_block_buf and b_block_buf into c_thread_buf.
+    //
+    //   a_block_buf  : block buffer holding A, addressed through ABlockDesc
+    //   b_block_buf  : block buffer holding B, addressed through BBlockDesc
+    //   c_thread_buf : per-thread accumulator, updated in place by xdlops_gemm.Run
+    //
+    // K is walked in steps of xdlops_gemm.KPerXdlops. For every K step the function loads two
+    // A sub-tiles and two B sub-tiles into Vgpr thread buffers and issues four xdlops_gemm.Run
+    // calls (sub-tile pairs 00, 01, 10, 11).
+    // NOTE(review): assumes the last two template arguments of xdlops_gemm.Run select the A and
+    // B sub-tile, as the inline comments below state -- confirm against xdlops_gemm.
+    //
+    // The loads for K step k are interleaved with the last two products of the previous step,
+    // and thread-buffer slots are overwritten as soon as they are no longer needed. The exact
+    // statement order is therefore part of the algorithm and must not be changed.
+    template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
+    __device__ void Run(const ABlockBuffer& a_block_buf,
+                        const BBlockBuffer& b_block_buf,
+                        CThreadBuffer& c_thread_buf) const
+    {
+        // per-thread staging buffers for the A and B sub-tiles (both sub-tiles share one buffer)
+        auto a_thread_buf =
+            make_static_buffer<AddressSpace::Vgpr, FloatAB>(a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf =
+            make_static_buffer<AddressSpace::Vgpr, FloatAB>(b_thread_desc_.GetElementSpaceSize());
+
+        constexpr index_t KPerBlock = ABlockDesc{}.GetLength(I0);
+
+        // ---- prologue: load all four sub-tiles for the first K step (index I0 along K) ----
+
+        // read A_sub_0
+        a_thread_copy_.Run(ABlockDesc{},
+                           make_tuple(I0, I0, I0, I0),
+                           a_block_buf,
+                           a_thread_desc_,
+                           make_tuple(I0, I0, I0, I0),
+                           a_thread_buf);
+
+        // read B_sub_0
+        b_thread_copy_.Run(BBlockDesc{},
+                           make_tuple(I0, I0, I0, I0),
+                           b_block_buf,
+                           b_thread_desc_,
+                           make_tuple(I0, I0, I0, I0),
+                           b_thread_buf);
+
+        // read B_sub_1
+        b_thread_copy_.Run(BBlockDesc{},
+                           make_tuple(I0, I1, I0, I0),
+                           b_block_buf,
+                           b_thread_desc_,
+                           make_tuple(I0, I1, I0, I0),
+                           b_thread_buf);
+
+        // read A_sub_1
+        a_thread_copy_.Run(ABlockDesc{},
+                           make_tuple(I0, I1, I0, I0),
+                           a_block_buf,
+                           a_thread_desc_,
+                           make_tuple(I0, I1, I0, I0),
+                           a_thread_buf);
+
+        // first two products of the first K step; the other two (10 and 11) are deferred so
+        // that they can overlap with the loads of the next K step
+
+        // C_sub_00 += transpose(A_sub_0) * B_sub_0
+        xdlops_gemm.template Run<decltype(a_thread_desc_),
+                                 decltype(b_thread_desc_),
+                                 decltype(c_thread_desc_),
+                                 0,
+                                 0>(a_thread_buf, b_thread_buf, c_thread_buf);
+
+        // C_sub_01 += transpose(A_sub_0) * B_sub_1
+        xdlops_gemm.template Run<decltype(a_thread_desc_),
+                                 decltype(b_thread_desc_),
+                                 decltype(c_thread_desc_),
+                                 0,
+                                 1>(a_thread_buf, b_thread_buf, c_thread_buf);
+
+        // ---- steady state: k runs over KPerXdlops, 2 * KPerXdlops, ... (< KPerBlock) ----
+        // On entry to each iteration the thread buffers still hold the previous K step and the
+        // products 10 and 11 of that step are still pending.
+        static_for<xdlops_gemm.KPerXdlops, KPerBlock, xdlops_gemm.KPerXdlops>{}([&](auto k) {
+            // A_sub_0 of the previous step is no longer needed (00 and 01 were already issued),
+            // so its slot is overwritten first with the data for step k
+            // read A_sub_0
+            a_thread_copy_.Run(ABlockDesc{},
+                               make_tuple(k, I0, I0, I0),
+                               a_block_buf,
+                               a_thread_desc_,
+                               make_tuple(I0, I0, I0, I0),
+                               a_thread_buf);
+
+            // pending product of the previous step; last use of the previous B_sub_0
+            // C_sub_10 += transpose(A_sub_1) * B_sub_0
+            xdlops_gemm.template Run<decltype(a_thread_desc_),
+                                     decltype(b_thread_desc_),
+                                     decltype(c_thread_desc_),
+                                     1,
+                                     0>(a_thread_buf, b_thread_buf, c_thread_buf);
+
+            // read B_sub_0
+            b_thread_copy_.Run(BBlockDesc{},
+                               make_tuple(k, I0, I0, I0),
+                               b_block_buf,
+                               b_thread_desc_,
+                               make_tuple(I0, I0, I0, I0),
+                               b_thread_buf);
+
+            // pending product of the previous step; last use of the previous A_sub_1 and B_sub_1
+            // C_sub_11 += transpose(A_sub_1) * B_sub_1
+            xdlops_gemm.template Run<decltype(a_thread_desc_),
+                                     decltype(b_thread_desc_),
+                                     decltype(c_thread_desc_),
+                                     1,
+                                     1>(a_thread_buf, b_thread_buf, c_thread_buf);
+
+            // read B_sub_1
+            b_thread_copy_.Run(BBlockDesc{},
+                               make_tuple(k, I1, I0, I0),
+                               b_block_buf,
+                               b_thread_desc_,
+                               make_tuple(I0, I1, I0, I0),
+                               b_thread_buf);
+
+            // read A_sub_1
+            a_thread_copy_.Run(ABlockDesc{},
+                               make_tuple(k, I1, I0, I0),
+                               a_block_buf,
+                               a_thread_desc_,
+                               make_tuple(I0, I1, I0, I0),
+                               a_thread_buf);
+
+            // all four sub-tiles now belong to step k: issue its first two products
+            // C_sub_00 += transpose(A_sub_0) * B_sub_0
+            xdlops_gemm.template Run<decltype(a_thread_desc_),
+                                     decltype(b_thread_desc_),
+                                     decltype(c_thread_desc_),
+                                     0,
+                                     0>(a_thread_buf, b_thread_buf, c_thread_buf);
+
+            // C_sub_01 += transpose(A_sub_0) * B_sub_1
+            xdlops_gemm.template Run<decltype(a_thread_desc_),
+                                     decltype(b_thread_desc_),
+                                     decltype(c_thread_desc_),
+                                     0,
+                                     1>(a_thread_buf, b_thread_buf, c_thread_buf);
+        });
+
+        // ---- epilogue: the products 10 and 11 of the last K step are still pending ----
+
+        // C_sub_10 += transpose(A_sub_1) * B_sub_0
+        xdlops_gemm.template Run<decltype(a_thread_desc_),
+                                 decltype(b_thread_desc_),
+                                 decltype(c_thread_desc_),
+                                 1,
+                                 0>(a_thread_buf, b_thread_buf, c_thread_buf);
+
+        // C_sub_11 += transpose(A_sub_1) * B_sub_1
+        xdlops_gemm.template Run<decltype(a_thread_desc_),
+                                 decltype(b_thread_desc_),
+                                 decltype(c_thread_desc_),
+                                 1,
+                                 1>(a_thread_buf, b_thread_buf, c_thread_buf);
+    }
+
+    private:
+    // Per-thread A tile: packed descriptor with lengths [1, MRepeat, 1, KPack].
+    // Run() indexes dimension 1 with I0 / I1 to address the individual A sub-tiles.
+    static constexpr auto a_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2(
+        make_tuple(I1, Number<MRepeat>{}, I1, Number<KPack>{}));
+
+    // Per-thread B tile: packed descriptor with lengths [1, NRepeat, 1, KPack].
+    // Run() indexes dimension 1 with I0 / I1 to address the individual B sub-tiles.
+    static constexpr auto b_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2(
+        make_tuple(I1, Number<NRepeat>{}, I1, Number<KPack>{}));
+
+    // Per-thread C layout: packed descriptor with lengths [MRepeat, NRepeat]
+    static constexpr auto c_thread_desc_ = make_dynamic_naive_tensor_descriptor_packed_v2(
+        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}));
+
+    // Copies one [1, 1, 1, KPack] slice of A from the block descriptor into the thread tile.
+    // NOTE(review): the three trailing integer arguments are presumably the source vector
+    // dimension, the scalars per vector and the scalar stride in a vector -- confirm against
+    // ThreadwiseDynamicTensorSliceTransfer_v4. The middle one is 1 with "KPack" left in a
+    // comment.
+    using AThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4<FloatAB,
+                                                                FloatAB,
+                                                                ABlockDesc,
+                                                                decltype(a_thread_desc_),
+                                                                Sequence<1, 1, 1, KPack>,
+                                                                Sequence<0, 1, 2, 3>,
+                                                                3,
+                                                                1, // KPack,
+                                                                1>;
+
+    // Copies one [1, 1, 1, KPack] slice of B from the block descriptor into the thread tile.
+    // Same template-argument layout as AThreadCopy, with BBlockDesc / b_thread_desc_.
+    using BThreadCopy = ThreadwiseDynamicTensorSliceTransfer_v4<FloatAB,
+                                                                FloatAB,
+                                                                BBlockDesc,
+                                                                decltype(b_thread_desc_),
+                                                                Sequence<1, 1, 1, KPack>,
+                                                                Sequence<0, 1, 2, 3>,
+                                                                3,
+                                                                1, // KPack,
+                                                                1>;
+
+    // copy helpers, initialized in the constructor with this thread's A / B origin
+    AThreadCopy a_thread_copy_;
+    BThreadCopy b_thread_copy_;
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_v1r1.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_contraction_v1r1.hpp
@@ -101,6 +101,7 @@ struct GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};

+    // GM0 and GN0 need to be known at compile-time
    static constexpr auto GM0 = CGM0GM1GN0GN1GridDesc{}.GetLength(I0);
    static constexpr auto GN0 = CGM0GM1GN0GN1GridDesc{}.GetLength(I2);

@@ -140,7 +141,7 @@ struct GridwiseDynamicContraction_km0m1_kn0n1_m0m1n0n1_v1r1
    {
        static_assert(is_known_at_compile_time<remove_cv_t<decltype(GM0)>>::value &&
                          is_known_at_compile_time<remove_cv_t<decltype(GN0)>>::value,
-                      "wrong!");
+                      "wrong! GM0 and GN0 need to be known at compile-time");

        const auto GM1 = a_gk_gm0_gm1_grid_desc.GetLength(I2);
        const auto GN1 = b_gk_gn0_gn1_grid_desc.GetLength(I2);
--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops.hpp
@@ -0,0 +1,585 @@
+#ifndef CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_HPP
+#define CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_HPP
+
+#include "common_header.hpp"
+#include "dynamic_multi_index_transform_helper.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "blockwise_gemm_xdlops.hpp"
+#include "blockwise_dynamic_tensor_slice_transfer.hpp"
+#include "threadwise_dynamic_tensor_slice_transfer.hpp"
+#include "threadwise_dynamic_tensor_slice_set.hpp"
+
+namespace ck {
+
+#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
+// Kernel entry point (tensor descriptors passed by value).
+// Forwards the raw data pointers and the descriptors to GridwiseGemm::Run and converts the two
+// boolean template flags into integral_constant tags.
+template <typename GridwiseGemm,
+          typename FloatA,
+          typename FloatB,
+          typename FloatC,
+          typename AGlobalDesc,
+          typename BGlobalDesc,
+          typename CGlobalDesc,
+          typename CBlockClusterDesc,
+          bool HasMainKBlockLoop,
+          bool HasDoubleTailKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_dynamic_gemm_xdlops_v1(const FloatA* __restrict__ p_a_global,
+                                      const FloatB* __restrict__ p_b_global,
+                                      FloatC* __restrict__ p_c_global,
+                                      const AGlobalDesc a_k0_m_k1_global_desc,
+                                      const BGlobalDesc b_k0_n_k1_global_desc,
+                                      const CGlobalDesc c_m0_m1_m2_n_global_desc,
+                                      const CBlockClusterDesc c_block_cluster_desc)
+{
+    // compile-time tags carrying the two K-loop flags into GridwiseGemm::Run
+    constexpr auto has_main_k_block_loop = integral_constant<bool, HasMainKBlockLoop>{};
+    constexpr auto has_double_tail_k_block_loop =
+        integral_constant<bool, HasDoubleTailKBlockLoop>{};
+
+    GridwiseGemm::Run(p_a_global,
+                      p_b_global,
+                      p_c_global,
+                      a_k0_m_k1_global_desc,
+                      b_k0_n_k1_global_desc,
+                      c_m0_m1_m2_n_global_desc,
+                      c_block_cluster_desc,
+                      has_main_k_block_loop,
+                      has_double_tail_k_block_loop);
+}
+#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
+// Kernel entry point (tensor descriptors passed by __CONSTANT__ void pointer).
+// __CONSTANT__ informs the compiler that the void pointers in the kernel signature point to
+// the non-modifiable parameter address space, so it can enable the corresponding optimization.
+template <typename GridwiseGemm,
+          typename FloatA,
+          typename FloatB,
+          typename FloatC,
+          typename AGlobalDesc,
+          typename BGlobalDesc,
+          typename CGlobalDesc,
+          typename CBlockClusterDesc,
+          bool HasMainKBlockLoop,
+          bool HasDoubleTailKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_dynamic_gemm_xdlops_v1(const FloatA* __restrict__ p_a_global,
+                                      const FloatB* __restrict__ p_b_global,
+                                      FloatC* __restrict__ p_c_global,
+                                      const void __CONSTANT__* p_a_k0_m_k1_global_desc,
+                                      const void __CONSTANT__* p_b_k0_n_k1_global_desc,
+                                      const void __CONSTANT__* p_c_m0_m1_m2_n_global_desc,
+                                      const void __CONSTANT__* p_c_block_cluster_desc)
+{
+    // Each descriptor is recovered in two steps: the __CONSTANT__ void* is first cast to a
+    // plain const void*, which is then cast to Desc* and dereferenced to copy the descriptor.
+    // The intermediate cast is required because the copy constructor of tensor descriptor
+    // doesn't take address_space(4).
+    const AGlobalDesc a_desc =
+        *reinterpret_cast<const AGlobalDesc*>((const void*)p_a_k0_m_k1_global_desc);
+    const BGlobalDesc b_desc =
+        *reinterpret_cast<const BGlobalDesc*>((const void*)p_b_k0_n_k1_global_desc);
+    const CGlobalDesc c_desc =
+        *reinterpret_cast<const CGlobalDesc*>((const void*)p_c_m0_m1_m2_n_global_desc);
+    const CBlockClusterDesc block_cluster_desc =
+        *reinterpret_cast<const CBlockClusterDesc*>((const void*)p_c_block_cluster_desc);
+
+    // compile-time tags carrying the two K-loop flags into GridwiseGemm::Run
+    constexpr auto has_main_k_block_loop = integral_constant<bool, HasMainKBlockLoop>{};
+    constexpr auto has_double_tail_k_block_loop =
+        integral_constant<bool, HasDoubleTailKBlockLoop>{};
+
+    GridwiseGemm::Run(p_a_global,
+                      p_b_global,
+                      p_c_global,
+                      a_desc,
+                      b_desc,
+                      c_desc,
+                      block_cluster_desc,
+                      has_main_k_block_loop,
+                      has_double_tail_k_block_loop);
+}
+#endif
+
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperation CGlobalMemoryDataOperation,
+          typename AGlobalDesc,
+          typename BGlobalDesc,
+          typename CGlobalDesc,
+          typename CBlockClusterDesc,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t KPack,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadSliceLengths_K_M_KPack,
+          typename ABlockTransferThreadClusterLengths_K_M_KPack,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_KPack,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          typename BBlockTransferThreadSliceLengths_K_N_KPack,
+          typename BBlockTransferThreadClusterLengths_K_N_KPack,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_KPack,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          typename AGlobalIteratorHacks,
+          typename BGlobalIteratorHacks,
+          typename CGlobalIteratorHacks,
+          typename AGlobalMoveSliceWindowIteratorHacks,
+          typename BGlobalMoveSliceWindowIteratorHacks>
+struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
+{
+    // Returns the number of bytes of shared memory needed for the A and B block tiles.
+    // Both tiles are described as [KPerBlock, {M,N}PerBlock, KPack] with KPack alignment, and
+    // the total is doubled (two copies of each tile).
+    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        // alignment (in elements) applied to every tile
+        constexpr auto lds_align = Number<KPack>{};
+
+        // A tile descriptor: [KPerBlock, MPerBlock, KPack]
+        constexpr auto a_tile_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, Number<KPack>{}), lds_align);
+
+        // B tile descriptor: [KPerBlock, NPerBlock, KPack]
+        constexpr auto b_tile_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, Number<KPack>{}), lds_align);
+
+        // element counts of one A tile and one B tile, each rounded up to the alignment
+        constexpr auto a_tile_elems =
+            math::integer_least_multiple(a_tile_desc.GetElementSpaceSize(), lds_align);
+        constexpr auto b_tile_elems =
+            math::integer_least_multiple(b_tile_desc.GetElementSpaceSize(), lds_align);
+
+        // one A tile plus one B tile
+        constexpr auto elems_per_buffer = a_tile_elems + b_tile_elems;
+
+        // two copies of each tile, converted from elements to bytes
+        return 2 * elems_per_buffer * sizeof(FloatAB);
+    }
+
+    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
+    __device__ static void Run(const FloatAB* __restrict__ p_a_global,
+                               const FloatAB* __restrict__ p_b_global,
+                               FloatC* __restrict__ p_c_global,
+                               const AGlobalDesc& a_k0_m_k1_global_desc,
+                               const BGlobalDesc& b_k0_n_k1_global_desc,
+                               const CGlobalDesc& c_m0_m1_m2_n_global_desc,
+                               const CBlockClusterDesc& c_block_cluster_desc,
+                               FloatAB* __restrict__ p_shared_block,
+                               integral_constant<bool, HasMainKBlockLoop>,
+                               integral_constant<bool, HasDoubleTailKBlockLoop>)
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
+        const auto a_global_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_a_global, a_k0_m_k1_global_desc.GetElementSpaceSize());
+        const auto b_global_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_b_global, b_k0_n_k1_global_desc.GetElementSpaceSize());
+        auto c_global_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_c_global, c_m0_m1_m2_n_global_desc.GetElementSpaceSize());
+
+        const auto K0 = a_k0_m_k1_global_desc.GetLength(I0);
+        const auto M  = a_k0_m_k1_global_desc.GetLength(I1);
+        const auto N  = b_k0_n_k1_global_desc.GetLength(I1);
+        const auto K1 = b_k0_n_k1_global_desc.GetLength(I2);
+
+        // divide block work by [M, N]
+        const auto block_work_idx =
+            c_block_cluster_desc.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        // HACK: this force m/n_block_data_idx_on_global into SGPR
+        const index_t m_block_data_idx_on_global =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
+
+        const index_t n_block_data_idx_on_global =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = Number<KPack>{};
+
+        // A matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, Number<KPack>{}), max_lds_align);
+
+        // B matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, Number<KPack>{}), max_lds_align);
+
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+                                                   InMemoryDataOperation::Set,
+                                                   Sequence<KPerBlock, MPerBlock, KPack>,
+                                                   ABlockTransferThreadSliceLengths_K_M_KPack,
+                                                   ABlockTransferThreadClusterLengths_K_M_KPack,
+                                                   ABlockTransferThreadClusterArrangeOrder,
+                                                   FloatAB,
+                                                   FloatAB,
+                                                   decltype(a_k0_m_k1_global_desc),
+                                                   decltype(a_k0_m_k1_block_desc),
+                                                   ABlockTransferSrcAccessOrder,
+                                                   Sequence<1, 0, 2>,
+                                                   ABlockTransferSrcVectorDim,
+                                                   2,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   ABlockTransferDstScalarPerVector_KPack,
+                                                   1,
+                                                   1,
+                                                   AThreadTransferSrcResetCoordinateAfterRun,
+                                                   true>(
+                a_k0_m_k1_global_desc,
+                make_multi_index(0, m_block_data_idx_on_global, 0),
+                a_k0_m_k1_block_desc,
+                make_multi_index(0, 0, 0));
+
+        // B matrix blockwise copy
+        auto b_blockwise_copy =
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+                                                   InMemoryDataOperation::Set,
+                                                   Sequence<KPerBlock, NPerBlock, KPack>,
+                                                   BBlockTransferThreadSliceLengths_K_N_KPack,
+                                                   BBlockTransferThreadClusterLengths_K_N_KPack,
+                                                   BBlockTransferThreadClusterArrangeOrder,
+                                                   FloatAB,
+                                                   FloatAB,
+                                                   decltype(b_k0_n_k1_global_desc),
+                                                   decltype(b_k0_n_k1_block_desc),
+                                                   BBlockTransferSrcAccessOrder,
+                                                   Sequence<1, 0, 2>,
+                                                   BBlockTransferSrcVectorDim,
+                                                   2,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   BBlockTransferDstScalarPerVector_KPack,
+                                                   1,
+                                                   1,
+                                                   BThreadTransferSrcResetCoordinateAfterRun,
+                                                   true>(
+                b_k0_n_k1_global_desc,
+                make_multi_index(0, n_block_data_idx_on_global, 0),
+                b_k0_n_k1_block_desc,
+                make_multi_index(0, 0, 0));
+
+        // GEMM definition
+        //   c_mtx += transpose(a_mtx) * b_mtx
+        //     a_mtx[KPerBlock, MPerBlock] is in LDS
+        //     b_mtx[KPerBlock, NPerBlock] is in LDS
+        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
+        //       register
+        // sanity check
+
+        static_assert(MPerBlock % (MPerWave * MRepeat) == 0 &&
+                          NPerBlock % (NPerWave * NRepeat) == 0,
+                      "wrong!");
+
+        constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor(
+            a_k0_m_k1_block_desc,
+            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<MRepeat>{}, Number<MPerBlock / MRepeat>{})),
+                       make_pass_through_transform(Number<KPack>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+        constexpr auto b_k0_n0_n1_k1_block_desc = transform_dynamic_tensor_descriptor(
+            b_k0_n_k1_block_desc,
+            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<NRepeat>{}, Number<NPerBlock / NRepeat>{})),
+                       make_pass_through_transform(Number<KPack>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+        const auto blockwise_gemm =
+            BlockwiseGemmXdlops_km_kn_m0m1m2n_v1<BlockSize,
+                                                 FloatAB,
+                                                 decltype(a_k0_m0_m1_k1_block_desc),
+                                                 decltype(b_k0_n0_n1_k1_block_desc),
+                                                 MPerWave,
+                                                 NPerWave,
+                                                 KPack>{};
+
+        constexpr auto CLayout = blockwise_gemm.GetCLayout();
+
+        constexpr index_t BlkSize   = CLayout.GetBlkSize();
+        constexpr index_t NumBlks   = CLayout.GetNumBlks();
+        constexpr index_t NumXdlops = CLayout.GetNumXdlops();
+
+        constexpr auto c_mr_nr_nx_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+            make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, Number<NumXdlops>{}));
+
+        constexpr auto c_blk_nb_bs_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+            make_tuple(Number<NumBlks>{}, Number<BlkSize>{}));
+
+        StaticBuffer<AddressSpace::Vgpr,
+                     vector_type<FloatAcc, c_blk_nb_bs_desc.GetElementSpaceSize()>,
+                     c_mr_nr_nx_desc.GetElementSpaceSize()>
+            c_thread_buf;
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size =
+            math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
+
+        constexpr auto b_block_space_size =
+            math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align);
+
+        FloatAB* p_a_block_double = p_shared_block;
+        FloatAB* p_b_block_double = p_shared_block + 2 * a_block_space_size;
+
+        // register allocation for output
+        // auto c_thread_buf = make_static_buffer<AddressSpace::Vgpr, FloatAcc>(
+        // c_m0_m1_n0_n1_thread_desc.GetElementSpaceSize());
+
+        // ThreadwiseDynamicTensorSliceSet_v1<FloatAcc,
+        // decltype(c_m0_m1_n0_n1_thread_desc),
+        // Sequence<MRepeat, MPerThread, NRepeat, NPerThread>>{}
+        //.Run(c_m0_m1_n0_n1_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0});
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
+
+        // hack to control index calculation when iterating over A and B matrix for threadwise copy
+        constexpr auto a_k0_m_k1_global_iterator_hacks = AGlobalIteratorHacks{};
+        constexpr auto b_k0_n_k1_global_iterator_hacks = BGlobalIteratorHacks{};
+
+        // hack to control index calculation when move slice window for A and B matrix for
+        // threadwise copy
+        constexpr auto a_k0_m_k1_global_move_slice_window_iterator_hack =
+            AGlobalMoveSliceWindowIteratorHacks{};
+        constexpr auto b_k0_n_k1_global_move_slice_window_iterator_hack =
+            BGlobalMoveSliceWindowIteratorHacks{};
+
+        auto a_block_even_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_a_block_double, a_k0_m_k1_block_desc.GetElementSpaceSize());
+        auto b_block_even_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_b_block_double, b_k0_n_k1_block_desc.GetElementSpaceSize());
+
+        auto a_block_odd_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_a_block_double + a_block_space_size, a_k0_m_k1_block_desc.GetElementSpaceSize());
+        auto b_block_odd_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_b_block_double + b_block_space_size, b_k0_n_k1_block_desc.GetElementSpaceSize());
+
+        // LDS double buffer: preload data into LDS
+        {
+            a_blockwise_copy.RunRead(
+                a_k0_m_k1_global_desc, a_global_buf, a_k0_m_k1_global_iterator_hacks);
+            b_blockwise_copy.RunRead(
+                b_k0_n_k1_global_desc, b_global_buf, b_k0_n_k1_global_iterator_hacks);
+
+            a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_even_buf);
+            b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_even_buf);
+        }
+
+        if constexpr(HasMainKBlockLoop)
+        {
+            index_t k_block_data_begin = 0;
+
+            // LDS double buffer: main body
+            // use Do-While loop instead of For loop to simplify control flow
+            do
+            {
+                // even iteration
+                a_blockwise_copy.MoveSrcSliceWindow(
+                    a_k0_m_k1_global_desc,
+                    a_block_slice_copy_step,
+                    a_k0_m_k1_global_move_slice_window_iterator_hack);
+                b_blockwise_copy.MoveSrcSliceWindow(
+                    b_k0_n_k1_global_desc,
+                    b_block_slice_copy_step,
+                    b_k0_n_k1_global_move_slice_window_iterator_hack);
+
+                __syncthreads();
+
+                // LDS double buffer: load next data from device mem
+                a_blockwise_copy.RunRead(
+                    a_k0_m_k1_global_desc, a_global_buf, a_k0_m_k1_global_iterator_hacks);
+                b_blockwise_copy.RunRead(
+                    b_k0_n_k1_global_desc, b_global_buf, b_k0_n_k1_global_iterator_hacks);
+
+                asm volatile("s_nop 0");
+
+                // LDS double buffer: GEMM on current data
+                blockwise_gemm.Run(a_block_even_buf, b_block_even_buf, c_thread_buf);
+
+                // LDS double buffer: store next data to LDS
+                a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_odd_buf);
+                b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_odd_buf);
+
+                // odd iteration
+                a_blockwise_copy.MoveSrcSliceWindow(
+                    a_k0_m_k1_global_desc,
+                    a_block_slice_copy_step,
+                    a_k0_m_k1_global_move_slice_window_iterator_hack);
+                b_blockwise_copy.MoveSrcSliceWindow(
+                    b_k0_n_k1_global_desc,
+                    b_block_slice_copy_step,
+                    b_k0_n_k1_global_move_slice_window_iterator_hack);
+
+                __syncthreads();
+
+                // LDS double buffer: load next data from device mem
+                a_blockwise_copy.RunRead(
+                    a_k0_m_k1_global_desc, a_global_buf, a_k0_m_k1_global_iterator_hacks);
+                b_blockwise_copy.RunRead(
+                    b_k0_n_k1_global_desc, b_global_buf, b_k0_n_k1_global_iterator_hacks);
+
+                asm volatile("s_nop 0");
+
+                // LDS double buffer: GEMM on current data
+                blockwise_gemm.Run(a_block_odd_buf, b_block_odd_buf, c_thread_buf);
+
+                // LDS double buffer: store next data to LDS
+                a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_even_buf);
+                b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_even_buf);
+
+                k_block_data_begin += 2 * KPerBlock;
+            } while(k_block_data_begin < K0 - 2 * KPerBlock);
+        }
+
+        // LDS double buffer: tail
+        if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
+        {
+            a_blockwise_copy.MoveSrcSliceWindow(a_k0_m_k1_global_desc,
+                                                a_block_slice_copy_step,
+                                                a_k0_m_k1_global_move_slice_window_iterator_hack);
+            b_blockwise_copy.MoveSrcSliceWindow(b_k0_n_k1_global_desc,
+                                                b_block_slice_copy_step,
+                                                b_k0_n_k1_global_move_slice_window_iterator_hack);
+
+            __syncthreads();
+
+            // LDS double buffer: load last data from device mem
+            a_blockwise_copy.RunRead(
+                a_k0_m_k1_global_desc, a_global_buf, a_k0_m_k1_global_iterator_hacks);
+            b_blockwise_copy.RunRead(
+                b_k0_n_k1_global_desc, b_global_buf, b_k0_n_k1_global_iterator_hacks);
+
+            // LDS double buffer: GEMM on 2nd-last data
+            blockwise_gemm.Run(a_block_even_buf, b_block_even_buf, c_thread_buf);
+
+            // LDS double buffer: store last data to LDS
+            a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_odd_buf);
+            b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_odd_buf);
+
+            __syncthreads();
+
+            // LDS double buffer: GEMM on last data
+            blockwise_gemm.Run(a_block_odd_buf, b_block_odd_buf, c_thread_buf);
+        }
+        else // if has 1 iteration left
+        {
+            __syncthreads();
+
+            // LDS double buffer: GEMM on last data
+            blockwise_gemm.Run(a_block_even_buf, b_block_even_buf, c_thread_buf);
+        }
+
+        // output: register to global memory
+        {
+
+            constexpr index_t M0 = CLayout.M1();
+            constexpr index_t M1 = CLayout.N1();
+            constexpr index_t M2 = CLayout.M0();
+
+            constexpr auto c_m0_m1_m2_n_thread_desc =
+                make_dynamic_naive_tensor_descriptor_packed_v2(
+                    make_tuple(Number<M0>{}, Number<1>{}, Number<M2>{}, Number<1>{}));
+
+            StaticBuffer<AddressSpace::Vgpr, FloatC, BlkSize> c_blk_buf_;
+
+            static_for<0, MRepeat, 1>{}([&](auto mr_i) {
+                static_for<0, NRepeat, 1>{}([&](auto nr_i) {
+                    static_for<0, NumXdlops, 1>{}([&](auto xdlops_i) {
+                        static_for<0, NumBlks, 1>{}([&](auto blk_i) {
+                            auto c_blk = c_thread_buf[Number<c_mr_nr_nx_desc.CalculateOffset(
+                                make_tuple(mr_i, nr_i, xdlops_i))>{}];
+
+                            static_for<0, BlkSize, 1>{}([&](auto j) {
+                                c_blk_buf_(j) = c_blk.template AsType<FloatAcc>()[Number<
+                                    c_blk_nb_bs_desc.CalculateOffset(make_tuple(blk_i, j))>{}];
+                            });
+
+                            // calculate origin of thread output tensor on global memory
+                            //     blockwise GEMM c matrix starting index
+                            const auto c_thread_mtx_on_block =
+                                blockwise_gemm.CalculateCThreadOriginDataIndex(
+                                    mr_i, nr_i, xdlops_i, blk_i);
+
+                            const index_t m_thread_data_on_global =
+                                m_block_data_idx_on_global + c_thread_mtx_on_block[I0];
+
+                            const index_t n_thread_data_on_global =
+                                n_block_data_idx_on_global + c_thread_mtx_on_block[I1];
+
+                            constexpr auto c_m0_m1_m2_n_global_tensor_iterator_hacks =
+                                CGlobalIteratorHacks{};
+
+                            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+                                FloatC,
+                                FloatC,
+                                decltype(c_m0_m1_m2_n_thread_desc),
+                                decltype(c_m0_m1_m2_n_global_desc),
+                                Sequence<M0, 1, M2, 1>,
+                                CThreadTransferSrcDstAccessOrder,
+                                CThreadTransferSrcDstVectorDim,
+                                CThreadTransferDstScalarPerVector,
+                                CGlobalMemoryDataOperation,
+                                1,
+                                true>{c_m0_m1_m2_n_global_desc,
+                                      make_multi_index(m_thread_data_on_global / (M2 * M1),
+                                                       m_thread_data_on_global % (M2 * M1) / M2,
+                                                       m_thread_data_on_global % M2,
+                                                       n_thread_data_on_global)}
+                                .Run(c_m0_m1_m2_n_thread_desc,
+                                     make_tuple(I0, I0, I0, I0),
+                                     c_blk_buf_,
+                                     c_m0_m1_m2_n_global_desc,
+                                     c_global_buf,
+                                     c_m0_m1_m2_n_global_tensor_iterator_hacks);
+                        });
+                    });
+                });
+            });
+        }
+    }
+
+    // Convenience overload of Run: declares the block's LDS workspace as a statically
+    // sized __shared__ array and forwards every argument, plus that workspace pointer,
+    // to the Run overload that accepts an explicit LDS buffer.
+    // The two unnamed integral_constant parameters exist only to carry
+    // HasMainKBlockLoop / HasDoubleTailKBlockLoop as compile-time tags.
+    template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
+    __device__ static void Run(const FloatAB* __restrict__ p_a_global,
+                               const FloatAB* __restrict__ p_b_global,
+                               FloatC* __restrict__ p_c_global,
+                               const AGlobalDesc& a_k0_m_k1_global_desc,
+                               const BGlobalDesc& b_k0_n_k1_global_desc,
+                               const CGlobalDesc& c_m0_m1_m2_n_global_desc,
+                               const CBlockClusterDesc& c_block_cluster_desc,
+                               integral_constant<bool, HasMainKBlockLoop>,
+                               integral_constant<bool, HasDoubleTailKBlockLoop>)
+    {
+        // compile-time tags re-created for the forwarded call
+        using MainLoopTag   = integral_constant<bool, HasMainKBlockLoop>;
+        using DoubleTailTag = integral_constant<bool, HasDoubleTailKBlockLoop>;
+
+        // workspace length in FloatAB elements, derived from the required byte count
+        constexpr index_t lds_element_count = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
+
+        __shared__ FloatAB lds_workspace[lds_element_count];
+
+        Run(p_a_global,
+            p_b_global,
+            p_c_global,
+            a_k0_m_k1_global_desc,
+            b_k0_n_k1_global_desc,
+            c_m0_m1_m2_n_global_desc,
+            c_block_cluster_desc,
+            lds_workspace,
+            MainLoopTag{},
+            DoubleTailTag{});
+    }
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2.hpp
@@ -0,0 +1,498 @@
+#ifndef CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2_HPP
+#define CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2_HPP
+
+#include "common_header.hpp"
+#include "dynamic_multi_index_transform_helper.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "blockwise_gemm_xdlops.hpp"
+#include "blockwise_dynamic_tensor_slice_transfer.hpp"
+#include "threadwise_dynamic_tensor_slice_transfer.hpp"
+#include "threadwise_dynamic_tensor_slice_set.hpp"
+
+namespace ck {
+
+#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
+// Kernel entry point compiled when CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE is set:
+// the four descriptors arrive as by-value kernel arguments.
+// The kernel does no work of its own; it forwards the three data pointers and the four
+// descriptors, unchanged and in the same order, to GridwiseGemm::Run.
+// When CK_USE_LAUNCH_BOUNDS is set, the kernel is annotated with
+// __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU).
+template <typename GridwiseGemm,
+          typename FloatA,
+          typename FloatB,
+          typename FloatC,
+          typename AGlobalDesc,
+          typename BGlobalDesc,
+          typename CGlobalDesc,
+          typename CBlockClusterDesc>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_dynamic_gemm_xdlops_v2(const FloatA* __restrict__ p_a_global,
+                                      const FloatB* __restrict__ p_b_global,
+                                      FloatC* __restrict__ p_c_global,
+                                      const AGlobalDesc a_k0_m_k1_global_desc,
+                                      const BGlobalDesc b_k0_n_k1_global_desc,
+                                      const CGlobalDesc c_m0_m1_m2_n_global_desc,
+                                      const CBlockClusterDesc c_block_cluster_desc)
+{
+    // pure forwarding call: all computation happens inside GridwiseGemm::Run
+    GridwiseGemm::Run(p_a_global,
+                      p_b_global,
+                      p_c_global,
+                      a_k0_m_k1_global_desc,
+                      b_k0_n_k1_global_desc,
+                      c_m0_m1_m2_n_global_desc,
+                      c_block_cluster_desc);
+}
+#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
+// pass tensor descriptor by __CONSTANT__ void pointer
+// __CONSTANT__ is needed to inform compiler void pointers in the kernel signature are pointing to
+// non-modifiable parameter address space, so compiler can enable corresponding optimization
+// Kernel entry point compiled when CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
+// is set: each descriptor arrives as a `const void __CONSTANT__*`.
+// The kernel copies every descriptor out of the pointed-to storage into a local object and
+// then forwards to GridwiseGemm::Run with exactly the same seven arguments as the by-value
+// variant of this kernel.
+template <typename GridwiseGemm,
+          typename FloatA,
+          typename FloatB,
+          typename FloatC,
+          typename AGlobalDesc,
+          typename BGlobalDesc,
+          typename CGlobalDesc,
+          typename CBlockClusterDesc>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_dynamic_gemm_xdlops_v2(const FloatA* __restrict__ p_a_global,
+                                      const FloatB* __restrict__ p_b_global,
+                                      FloatC* __restrict__ p_c_global,
+                                      const void __CONSTANT__* p_a_k0_m_k1_global_desc,
+                                      const void __CONSTANT__* p_b_k0_n_k1_global_desc,
+                                      const void __CONSTANT__* p_c_m0_m1_m2_n_global_desc,
+                                      const void __CONSTANT__* p_c_block_cluster_desc)
+{
+    // first cast `const void __CONSTANT__*` to `const void*`
+    // second cast `const void*` to `const Desc*`
+    // the copy constructor of tensor descriptor doesn't take address_space(4)
+    const auto a_k0_m_k1_global_desc =
+        *reinterpret_cast<const AGlobalDesc*>((const void*)p_a_k0_m_k1_global_desc);
+    const auto b_k0_n_k1_global_desc =
+        *reinterpret_cast<const BGlobalDesc*>((const void*)p_b_k0_n_k1_global_desc);
+    const auto c_m0_m1_m2_n_global_desc =
+        *reinterpret_cast<const CGlobalDesc*>((const void*)p_c_m0_m1_m2_n_global_desc);
+
+    const auto c_block_cluster_desc =
+        *reinterpret_cast<const CBlockClusterDesc*>((const void*)p_c_block_cluster_desc);
+
+    // This kernel has no HasMainKBlockLoop / HasDoubleTailKBlockLoop template parameters,
+    // so no integral_constant tags are passed; the call matches the by-value variant.
+    GridwiseGemm::Run(p_a_global,
+                      p_b_global,
+                      p_c_global,
+                      a_k0_m_k1_global_desc,
+                      b_k0_n_k1_global_desc,
+                      c_m0_m1_m2_n_global_desc,
+                      c_block_cluster_desc);
+}
+#endif
+
+// Gridwise GEMM that drives BlockwiseGemmXdlops_km_kn_m0m1m2n_v1 with ONE LDS buffer per
+// operand (a_block_buf / b_block_buf); no even/odd double buffering is used here.
+//
+// Operand layout, as used by the code below:
+//   A: [K0, M, K1] in global memory, tiled per block as [KPerBlock, MPerBlock, KPack]
+//   B: [K0, N, K1] in global memory, tiled per block as [KPerBlock, NPerBlock, KPack]
+//   C: written through a 4-d [M0, M1, M2, N] global descriptor
+//
+// Template parameter groups:
+//   BlockSize, FloatAB/FloatAcc/FloatC, CGlobalMemoryDataOperation  - launch size, element
+//       types and the memory operation used when C is stored
+//   *GlobalDesc, CBlockClusterDesc                                  - descriptor types
+//   M/N/KPerBlock, M/NPerWave, KPack, M/NRepeat                     - tiling
+//   ABlockTransfer*, BBlockTransfer*                                - blockwise copy config
+//   CThreadTransfer*                                                - C store config
+//   *IteratorHacks                                                  - index-calculation hints
+//       passed through to the copy helpers
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperation CGlobalMemoryDataOperation,
+          typename AGlobalDesc,
+          typename BGlobalDesc,
+          typename CGlobalDesc,
+          typename CBlockClusterDesc,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t KPack,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadSliceLengths_K_M_KPack,
+          typename ABlockTransferThreadClusterLengths_K_M_KPack,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_KPack,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          typename BBlockTransferThreadSliceLengths_K_N_KPack,
+          typename BBlockTransferThreadClusterLengths_K_N_KPack,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_KPack,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          typename AGlobalIteratorHacks,
+          typename BGlobalIteratorHacks,
+          typename CGlobalIteratorHacks,
+          typename AGlobalMoveSliceWindowIteratorHacks,
+          typename BGlobalMoveSliceWindowIteratorHacks>
+struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2
+{
+    // Returns the LDS footprint in bytes: one A tile plus one B tile, each rounded up to a
+    // multiple of KPack elements, times sizeof(FloatAB).
+    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        constexpr auto max_lds_align = Number<KPack>{};
+
+        // A matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, Number<KPack>{}), max_lds_align);
+
+        // B matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, Number<KPack>{}), max_lds_align);
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size =
+            math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
+
+        constexpr auto b_block_space_size =
+            math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align);
+
+        return (a_block_space_size + b_block_space_size) * sizeof(FloatAB);
+    }
+
+    // Computes one [MPerBlock, NPerBlock] tile of C for the calling thread block.
+    //   p_a_global / p_b_global / p_c_global : base pointers of A, B and C
+    //   a_/b_/c_..._global_desc              : descriptors of A, B and C
+    //   c_block_cluster_desc                 : maps the 1-d block id to the block's
+    //                                          (m, n) tile index
+    //   p_shared_block                       : LDS workspace; A's tile is placed first and
+    //                                          B's tile follows at a_block_space_size
+    // The K dimension is walked in steps of KPerBlock: a slice is loaded into LDS, the
+    // blockwise GEMM accumulates it into c_thread_buf, and finally each thread stores its
+    // accumulators to C.
+    __device__ static void Run(const FloatAB* __restrict__ p_a_global,
+                               const FloatAB* __restrict__ p_b_global,
+                               FloatC* __restrict__ p_c_global,
+                               const AGlobalDesc& a_k0_m_k1_global_desc,
+                               const BGlobalDesc& b_k0_n_k1_global_desc,
+                               const CGlobalDesc& c_m0_m1_m2_n_global_desc,
+                               const CBlockClusterDesc& c_block_cluster_desc,
+                               FloatAB* __restrict__ p_shared_block)
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+
+        const auto a_global_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_a_global, a_k0_m_k1_global_desc.GetElementSpaceSize());
+        const auto b_global_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_b_global, b_k0_n_k1_global_desc.GetElementSpaceSize());
+        auto c_global_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_c_global, c_m0_m1_m2_n_global_desc.GetElementSpaceSize());
+
+        // extent of the reduction dimension; it bounds the main K loop below
+        const auto K0 = a_k0_m_k1_global_desc.GetLength(I0);
+
+        // divide block work by [M, N]
+        const auto block_work_idx =
+            c_block_cluster_desc.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        // HACK: this force m/n_block_data_idx_on_global into SGPR
+        const index_t m_block_data_idx_on_global =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
+
+        const index_t n_block_data_idx_on_global =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = Number<KPack>{};
+
+        // A matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, Number<KPack>{}), max_lds_align);
+
+        // B matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, Number<KPack>{}), max_lds_align);
+
+        // A matrix blockwise copy
+        //   source window starts at (0, m_block_data_idx_on_global, 0) in global A,
+        //   destination window at (0, 0, 0) in the A LDS tile
+        auto a_blockwise_copy =
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+                                                   InMemoryDataOperation::Set,
+                                                   Sequence<KPerBlock, MPerBlock, KPack>,
+                                                   ABlockTransferThreadSliceLengths_K_M_KPack,
+                                                   ABlockTransferThreadClusterLengths_K_M_KPack,
+                                                   ABlockTransferThreadClusterArrangeOrder,
+                                                   FloatAB,
+                                                   FloatAB,
+                                                   decltype(a_k0_m_k1_global_desc),
+                                                   decltype(a_k0_m_k1_block_desc),
+                                                   ABlockTransferSrcAccessOrder,
+                                                   Sequence<1, 0, 2>,
+                                                   ABlockTransferSrcVectorDim,
+                                                   2,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   ABlockTransferDstScalarPerVector_KPack,
+                                                   1,
+                                                   1,
+                                                   AThreadTransferSrcResetCoordinateAfterRun,
+                                                   true>(
+                a_k0_m_k1_global_desc,
+                make_multi_index(0, m_block_data_idx_on_global, 0),
+                a_k0_m_k1_block_desc,
+                make_multi_index(0, 0, 0));
+
+        // B matrix blockwise copy
+        //   source window starts at (0, n_block_data_idx_on_global, 0) in global B,
+        //   destination window at (0, 0, 0) in the B LDS tile
+        auto b_blockwise_copy =
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+                                                   InMemoryDataOperation::Set,
+                                                   Sequence<KPerBlock, NPerBlock, KPack>,
+                                                   BBlockTransferThreadSliceLengths_K_N_KPack,
+                                                   BBlockTransferThreadClusterLengths_K_N_KPack,
+                                                   BBlockTransferThreadClusterArrangeOrder,
+                                                   FloatAB,
+                                                   FloatAB,
+                                                   decltype(b_k0_n_k1_global_desc),
+                                                   decltype(b_k0_n_k1_block_desc),
+                                                   BBlockTransferSrcAccessOrder,
+                                                   Sequence<1, 0, 2>,
+                                                   BBlockTransferSrcVectorDim,
+                                                   2,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   BBlockTransferDstScalarPerVector_KPack,
+                                                   1,
+                                                   1,
+                                                   BThreadTransferSrcResetCoordinateAfterRun,
+                                                   true>(
+                b_k0_n_k1_global_desc,
+                make_multi_index(0, n_block_data_idx_on_global, 0),
+                b_k0_n_k1_block_desc,
+                make_multi_index(0, 0, 0));
+
+        // GEMM definition
+        //   c_mtx += transpose(a_mtx) * b_mtx
+        //     a_mtx[KPerBlock, MPerBlock] is in LDS
+        //     b_mtx[KPerBlock, NPerBlock] is in LDS
+        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
+        //       register
+        // sanity check
+
+        static_assert(MPerBlock % (MPerWave * MRepeat) == 0 &&
+                          NPerBlock % (NPerWave * NRepeat) == 0,
+                      "wrong!");
+
+        // view of the A LDS tile with M split into [MRepeat, MPerBlock / MRepeat]
+        constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor(
+            a_k0_m_k1_block_desc,
+            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<MRepeat>{}, Number<MPerBlock / MRepeat>{})),
+                       make_pass_through_transform(Number<KPack>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+        // view of the B LDS tile with N split into [NRepeat, NPerBlock / NRepeat]
+        constexpr auto b_k0_n0_n1_k1_block_desc = transform_dynamic_tensor_descriptor(
+            b_k0_n_k1_block_desc,
+            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<NRepeat>{}, Number<NPerBlock / NRepeat>{})),
+                       make_pass_through_transform(Number<KPack>{})),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+        const auto blockwise_gemm =
+            BlockwiseGemmXdlops_km_kn_m0m1m2n_v1<BlockSize,
+                                                 FloatAB,
+                                                 decltype(a_k0_m0_m1_k1_block_desc),
+                                                 decltype(b_k0_n0_n1_k1_block_desc),
+                                                 MPerWave,
+                                                 NPerWave,
+                                                 KPack>{};
+
+        constexpr auto CLayout = blockwise_gemm.GetCLayout();
+
+        constexpr index_t BlkSize   = CLayout.GetBlkSize();
+        constexpr index_t NumBlks   = CLayout.GetNumBlks();
+        constexpr index_t NumXdlops = CLayout.GetNumXdlops();
+
+        // c_thread_buf holds one vector per (MRepeat, NRepeat, NumXdlops) entry;
+        // each vector holds NumBlks * BlkSize accumulators
+        constexpr auto c_mr_nr_nx_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+            make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, Number<NumXdlops>{}));
+
+        constexpr auto c_blk_nb_bs_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+            make_tuple(Number<NumBlks>{}, Number<BlkSize>{}));
+
+        // NOTE(review): c_thread_buf is not explicitly cleared anywhere in this function;
+        // presumably vector_type's default constructor zero-initialises it - confirm.
+        StaticBuffer<AddressSpace::Vgpr,
+                     vector_type<FloatAcc, c_blk_nb_bs_desc.GetElementSpaceSize()>,
+                     c_mr_nr_nx_desc.GetElementSpaceSize()>
+            c_thread_buf;
+
+        // LDS allocation for A and B: be careful of alignment
+        //   only A's size is needed here, as the offset at which B's tile starts
+        constexpr auto a_block_space_size =
+            math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
+
+        FloatAB* p_a_block = p_shared_block;
+        FloatAB* p_b_block = p_shared_block + a_block_space_size;
+
+        // each main-loop step advances both source windows by KPerBlock along K0
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
+
+        // hack to control index calculation when iterating over A and B matrix for threadwise copy
+        constexpr auto a_k0_m_k1_global_iterator_hacks = AGlobalIteratorHacks{};
+        constexpr auto b_k0_n_k1_global_iterator_hacks = BGlobalIteratorHacks{};
+
+        // hack to control index calculation when move slice window for A and B matrix for
+        // threadwise copy
+        constexpr auto a_k0_m_k1_global_move_slice_window_iterator_hack =
+            AGlobalMoveSliceWindowIteratorHacks{};
+        constexpr auto b_k0_n_k1_global_move_slice_window_iterator_hack =
+            BGlobalMoveSliceWindowIteratorHacks{};
+
+        auto a_block_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
+        auto b_block_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());
+
+        // preload data into LDS
+        {
+            a_blockwise_copy.RunRead(
+                a_k0_m_k1_global_desc, a_global_buf, a_k0_m_k1_global_iterator_hacks);
+            b_blockwise_copy.RunRead(
+                b_k0_n_k1_global_desc, b_global_buf, b_k0_n_k1_global_iterator_hacks);
+
+            a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf);
+            b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf);
+        }
+
+        // main body
+        //   Entry-guarded loop: when K0 <= KPerBlock the preloaded slice is the only one,
+        //   so no further slice is fetched and only the tail GEMM below runs. For
+        //   K0 > KPerBlock the iterations are the same as a do-while with this condition.
+        index_t k_block_data_begin = 0;
+
+        while(k_block_data_begin < (K0 - KPerBlock))
+        {
+            // advance both source windows to the next K slice
+            a_blockwise_copy.MoveSrcSliceWindow(a_k0_m_k1_global_desc,
+                                                a_block_slice_copy_step,
+                                                a_k0_m_k1_global_move_slice_window_iterator_hack);
+            b_blockwise_copy.MoveSrcSliceWindow(b_k0_n_k1_global_desc,
+                                                b_block_slice_copy_step,
+                                                b_k0_n_k1_global_move_slice_window_iterator_hack);
+
+            // read the next slice from global memory before the GEMM on the current one
+            a_blockwise_copy.RunRead(
+                a_k0_m_k1_global_desc, a_global_buf, a_k0_m_k1_global_iterator_hacks);
+            b_blockwise_copy.RunRead(
+                b_k0_n_k1_global_desc, b_global_buf, b_k0_n_k1_global_iterator_hacks);
+
+            // barrier before the GEMM reads the slice currently held in LDS
+            block_sync_lds();
+
+            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+
+            // barrier before the single LDS buffer is overwritten with the next slice
+            block_sync_lds();
+
+            a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf);
+            b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf);
+
+            k_block_data_begin += KPerBlock;
+        }
+
+        // tail: GEMM on the last slice written to LDS
+        {
+            block_sync_lds();
+
+            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+        }
+
+        // output: register to global memory
+        {
+
+            // NOTE: the M0/M1/M2 lengths are taken from CLayout.M1(), CLayout.N1() and
+            // CLayout.M0() respectively; the names do not line up one-to-one
+            constexpr index_t M0 = CLayout.M1();
+            constexpr index_t M1 = CLayout.N1();
+            constexpr index_t M2 = CLayout.M0();
+
+            // per-thread source tile of one store: [M0, 1, M2, 1], packed
+            constexpr auto c_m0_m1_m2_n_thread_desc =
+                make_dynamic_naive_tensor_descriptor_packed_v2(
+                    make_tuple(Number<M0>{}, Number<1>{}, Number<M2>{}, Number<1>{}));
+
+            // staging buffer of BlkSize FloatC values, refilled for every (xdlops, blk) pair
+            StaticBuffer<AddressSpace::Vgpr, FloatC, BlkSize> c_blk_buf_;
+
+            static_for<0, MRepeat, 1>{}([&](auto mr_i) {
+                static_for<0, NRepeat, 1>{}([&](auto nr_i) {
+                    static_for<0, NumXdlops, 1>{}([&](auto xdlops_i) {
+                        static_for<0, NumBlks, 1>{}([&](auto blk_i) {
+                            // accumulator vector of this (mr, nr, xdlops) entry
+                            auto c_blk = c_thread_buf[Number<c_mr_nr_nx_desc.CalculateOffset(
+                                make_tuple(mr_i, nr_i, xdlops_i))>{}];
+
+                            // copy block blk_i of the vector (FloatAcc) into the FloatC
+                            // staging buffer, element by element
+                            static_for<0, BlkSize, 1>{}([&](auto j) {
+                                c_blk_buf_(j) = c_blk.template AsType<FloatAcc>()[Number<
+                                    c_blk_nb_bs_desc.CalculateOffset(make_tuple(blk_i, j))>{}];
+                            });
+
+                            // calculate origin of thread output tensor on global memory
+                            //     blockwise GEMM c matrix starting index
+                            const auto c_thread_mtx_on_block =
+                                blockwise_gemm.CalculateCThreadOriginDataIndex(
+                                    mr_i, nr_i, xdlops_i, blk_i);
+
+                            const index_t m_thread_data_on_global =
+                                m_block_data_idx_on_global + c_thread_mtx_on_block[I0];
+
+                            const index_t n_thread_data_on_global =
+                                n_block_data_idx_on_global + c_thread_mtx_on_block[I1];
+
+                            constexpr auto c_m0_m1_m2_n_global_tensor_iterator_hacks =
+                                CGlobalIteratorHacks{};
+
+                            // the flat m index is decomposed into (m0, m1, m2) with
+                            // m = (m0 * M1 + m1) * M2 + m2 to address the 4-d C descriptor
+                            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+                                FloatC,
+                                FloatC,
+                                decltype(c_m0_m1_m2_n_thread_desc),
+                                decltype(c_m0_m1_m2_n_global_desc),
+                                Sequence<M0, 1, M2, 1>,
+                                CThreadTransferSrcDstAccessOrder,
+                                CThreadTransferSrcDstVectorDim,
+                                CThreadTransferDstScalarPerVector,
+                                CGlobalMemoryDataOperation,
+                                1,
+                                true>{c_m0_m1_m2_n_global_desc,
+                                      make_multi_index(m_thread_data_on_global / (M2 * M1),
+                                                       m_thread_data_on_global % (M2 * M1) / M2,
+                                                       m_thread_data_on_global % M2,
+                                                       n_thread_data_on_global)}
+                                .Run(c_m0_m1_m2_n_thread_desc,
+                                     make_tuple(I0, I0, I0, I0),
+                                     c_blk_buf_,
+                                     c_m0_m1_m2_n_global_desc,
+                                     c_global_buf,
+                                     c_m0_m1_m2_n_global_tensor_iterator_hacks);
+                        });
+                    });
+                });
+            });
+        }
+    }
+
+    // Convenience overload: declares the LDS workspace as a statically sized __shared__
+    // array (GetSharedMemoryNumberOfByte() bytes, expressed in FloatAB elements) and
+    // forwards to the overload above.
+    __device__ static void Run(const FloatAB* __restrict__ p_a_global,
+                               const FloatAB* __restrict__ p_b_global,
+                               FloatC* __restrict__ p_c_global,
+                               const AGlobalDesc& a_k0_m_k1_global_desc,
+                               const BGlobalDesc& b_k0_n_k1_global_desc,
+                               const CGlobalDesc& c_m0_m1_m2_n_global_desc,
+                               const CBlockClusterDesc& c_block_cluster_desc)
+    {
+        constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
+
+        __shared__ FloatAB p_shared_block[shared_block_size];
+
+        Run(p_a_global,
+            p_b_global,
+            p_c_global,
+            a_k0_m_k1_global_desc,
+            b_k0_n_k1_global_desc,
+            c_m0_m1_m2_n_global_desc,
+            c_block_cluster_desc,
+            p_shared_block);
+    }
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r2.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r2.hpp
@@ -0,0 +1,509 @@
+#ifndef CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2R2_HPP
+#define CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2R2_HPP
+
+#include "common_header.hpp"
+#include "dynamic_multi_index_transform_helper.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "blockwise_gemm_xdlops.hpp"
+#include "blockwise_dynamic_tensor_slice_transfer.hpp"
+#include "threadwise_dynamic_tensor_slice_transfer.hpp"
+#include "threadwise_dynamic_tensor_slice_set.hpp"
+
+namespace ck {
+
+// Kernel entry point: declares the workgroup's shared-memory scratch array, sized as
+// GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB) elements, and forwards it with all
+// kernel arguments to GridwiseGemm::Run. Descriptors and the adaptor are passed by value.
+template <typename GridwiseGemm,
+          typename FloatAB,
+          typename FloatC,
+          typename AK0MK1GridDesc,
+          typename BK0NK1GridDesc,
+          typename CM0M1M2NGridDesc,
+          typename CBlockClusterAdaptor>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_dynamic_gemm_xdlops_v2r2(const FloatAB* __restrict__ p_a_grid,
+                                        const FloatAB* __restrict__ p_b_grid,
+                                        FloatC* __restrict__ p_c_grid,
+                                        const AK0MK1GridDesc a_k0_m_k1_grid_desc,
+                                        const BK0NK1GridDesc b_k0_n_k1_grid_desc,
+                                        const CM0M1M2NGridDesc c_m0_m1_m2_n_grid_desc,
+                                        const CBlockClusterAdaptor c_block_cluster_adaptor)
+{
+    constexpr index_t lds_element_count =
+        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
+
+    __shared__ FloatAB lds_workspace[lds_element_count];
+
+    GridwiseGemm::Run(p_a_grid, p_b_grid, p_c_grid,
+                      lds_workspace,
+                      a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc,
+                      c_m0_m1_m2_n_grid_desc,
+                      c_block_cluster_adaptor);
+}
+
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperation CGlobalMemoryDataOperation,
+          typename AK0MK1GridDesc,
+          typename BK0NK1GridDesc,
+          typename CMNGridDesc,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadSliceLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_K1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          typename BBlockTransferThreadSliceLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_K1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          typename AGridIteratorHacks,
+          typename BGridIteratorHacks,
+          typename CGridIteratorHacks,
+          typename AGridMoveSliceWindowIteratorHacks,
+          typename BGridMoveSliceWindowIteratorHacks>
+struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r2
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    // K1 should be Number<...>
+    static constexpr auto K1 = AK0MK1GridDesc{}.GetLength(I2);
+
+    // Bytes of LDS needed for one A tile plus one B tile of FloatAB elements.
+    // Each tile is described by an aligned [KPerBlock, MPerBlock or NPerBlock, K1] descriptor,
+    // and each tile's element-space size is rounded up to a multiple of K1 before the two are
+    // summed and scaled by sizeof(FloatAB).
+    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        constexpr auto lds_align = K1;
+
+        // A tile: descriptor, then padded element count
+        constexpr auto a_tile_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), lds_align);
+        constexpr auto a_tile_elems =
+            math::integer_least_multiple(a_tile_desc.GetElementSpaceSize(), lds_align);
+
+        // B tile: descriptor, then padded element count
+        constexpr auto b_tile_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), lds_align);
+        constexpr auto b_tile_elems =
+            math::integer_least_multiple(b_tile_desc.GetElementSpaceSize(), lds_align);
+
+        // element count -> byte count
+        return (a_tile_elems + b_tile_elems) * sizeof(FloatAB);
+    }
+
+    // Returns true when the three grid descriptors agree with each other and with the tile sizes:
+    // matching M, N, K0 and K1 extents, M/N/K0 divisible by the per-block tile sizes, and the
+    // per-block M/N tile sizes divisible by the per-wave sizes. K1 must be a compile-time value.
+    __host__ __device__ static constexpr bool
+    CheckValidity(const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
+                  const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
+                  const CMNGridDesc& c_m_n_grid_desc)
+    {
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
+                      "wrong! K1 need to be known at compile-time");
+
+        const auto m_len  = a_k0_m_k1_grid_desc.GetLength(I1);
+        const auto n_len  = b_k0_n_k1_grid_desc.GetLength(I1);
+        const auto k0_len = a_k0_m_k1_grid_desc.GetLength(I0);
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return m_len == c_m_n_grid_desc.GetLength(I0) && n_len == c_m_n_grid_desc.GetLength(I1) &&
+               k0_len == b_k0_n_k1_grid_desc.GetLength(I0) &&
+               K1 == a_k0_m_k1_grid_desc.GetLength(I2) && K1 == b_k0_n_k1_grid_desc.GetLength(I2) &&
+               m_len % MPerBlock == 0 && n_len % NPerBlock == 0 && k0_len % KPerBlock == 0 &&
+               MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0;
+    }
+
+    // Grid size: the count of MPerBlock x NPerBlock tiles in the M x N extent of C.
+    // Both quotients use integer division, so M and N should be multiples of the tile sizes.
+    __host__ __device__ static constexpr index_t
+    CalculateGridSize(const CMNGridDesc& c_m_n_grid_desc)
+    {
+        const index_t m_tile_count = c_m_n_grid_desc.GetLength(I0) / MPerBlock;
+        const index_t n_tile_count = c_m_n_grid_desc.GetLength(I1) / NPerBlock;
+
+        return m_tile_count * n_tile_count;
+    }
+
+    // Builds the 4-d [M0, M1, M2, N] view of the 2-d [M, N] C grid descriptor: dimension M is
+    // unmerged into (M / (M1 * M2), M1, M2) and N is passed through unchanged, where
+    // M1 = CLayout.N1() and M2 = CLayout.M0() of the XdlopsGemm output layout.
+    __host__ __device__ static constexpr auto
+    MakeCM0M1M2NGridDescriptor(const CMNGridDesc& c_m_n_grid_desc)
+    {
+        const auto M = c_m_n_grid_desc.GetLength(I0);
+        const auto N = c_m_n_grid_desc.GetLength(I1);
+
+        constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerWave, NPerWave, K1.value>{};
+
+        constexpr auto CLayout = xdlops_gemm.GetCLayout();
+
+        constexpr auto M1 = Number<CLayout.N1()>{};
+        constexpr auto M2 = Number<CLayout.M0()>{};
+
+        return transform_dynamic_tensor_descriptor(
+            c_m_n_grid_desc,
+            make_tuple(make_unmerge_transform(make_tuple(M / (M1 * M2), M1, M2)),
+                       make_pass_through_transform(N)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}));
+    }
+
+    // Builds the adaptor that turns a 1-d block id into a 2-d (m0, n0) block-tile index by
+    // merging the tile counts (M / MPerBlock, N / NPerBlock). Run() evaluates it with
+    // CalculateBottomIndex(make_multi_index(get_block_1d_id())).
+    __host__ __device__ static constexpr auto
+    MakeCBlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc)
+    {
+        constexpr auto m_per_block = Number<MPerBlock>{};
+        constexpr auto n_per_block = Number<NPerBlock>{};
+
+        // number of block tiles along M and along N
+        const auto m_tile_count = c_m_n_grid_desc.GetLength(I0) / m_per_block;
+        const auto n_tile_count = c_m_n_grid_desc.GetLength(I1) / n_per_block;
+
+        // one merge transform over the two tile counts
+        return make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(m_tile_count, n_tile_count))),
+            make_tuple(Sequence<0, 1>{}),
+            make_tuple(Sequence<0>{}));
+    }
+
+    using CM0M1M2NGridDesc     = decltype(MakeCM0M1M2NGridDescriptor(CMNGridDesc{}));
+    using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}));
+
+    // Computes one MPerBlock x NPerBlock tile of C for the workgroup selected by
+    // get_block_1d_id(): A/B tiles are staged through p_shared_block (LDS) KPerBlock rows of K0 at
+    // a time, accumulated with the xdlops blockwise GEMM, then written to p_c_grid.
+    // K0 is expected to be a positive multiple of KPerBlock (see CheckValidity).
+    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
+                               const FloatAB* __restrict__ p_b_grid,
+                               FloatC* __restrict__ p_c_grid,
+                               FloatAB* __restrict__ p_shared_block,
+                               const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
+                               const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
+                               const CM0M1M2NGridDesc& c_m0_m1_m2_n_grid_desc,
+                               const CBlockClusterAdaptor& c_block_cluster_adaptor)
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_a_grid, a_k0_m_k1_grid_desc.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_b_grid, b_k0_n_k1_grid_desc.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_c_grid, c_m0_m1_m2_n_grid_desc.GetElementSpaceSize());
+
+        const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
+
+        // divide block work by [M, N]
+        const auto block_work_idx =
+            c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
+
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = K1;
+
+        // A matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
+
+        // B matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
+
+        // A matrix blockwise copy
+        auto a_blockwise_copy =
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+                                                   InMemoryDataOperation::Set,
+                                                   Sequence<KPerBlock, MPerBlock, K1.value>,
+                                                   ABlockTransferThreadSliceLengths_K0_M_K1,
+                                                   ABlockTransferThreadClusterLengths_K0_M_K1,
+                                                   ABlockTransferThreadClusterArrangeOrder,
+                                                   FloatAB,
+                                                   FloatAB,
+                                                   decltype(a_k0_m_k1_grid_desc),
+                                                   decltype(a_k0_m_k1_block_desc),
+                                                   ABlockTransferSrcAccessOrder,
+                                                   Sequence<1, 0, 2>,
+                                                   ABlockTransferSrcVectorDim,
+                                                   2,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   ABlockTransferDstScalarPerVector_K1,
+                                                   1,
+                                                   1,
+                                                   AThreadTransferSrcResetCoordinateAfterRun,
+                                                   true>(
+                a_k0_m_k1_grid_desc,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_k0_m_k1_block_desc,
+                make_multi_index(0, 0, 0));
+
+        // B matrix blockwise copy
+        auto b_blockwise_copy =
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+                                                   InMemoryDataOperation::Set,
+                                                   Sequence<KPerBlock, NPerBlock, K1.value>,
+                                                   BBlockTransferThreadSliceLengths_K0_N_K1,
+                                                   BBlockTransferThreadClusterLengths_K0_N_K1,
+                                                   BBlockTransferThreadClusterArrangeOrder,
+                                                   FloatAB,
+                                                   FloatAB,
+                                                   decltype(b_k0_n_k1_grid_desc),
+                                                   decltype(b_k0_n_k1_block_desc),
+                                                   BBlockTransferSrcAccessOrder,
+                                                   Sequence<1, 0, 2>,
+                                                   BBlockTransferSrcVectorDim,
+                                                   2,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   BBlockTransferDstScalarPerVector_K1,
+                                                   1,
+                                                   1,
+                                                   BThreadTransferSrcResetCoordinateAfterRun,
+                                                   true>(
+                b_k0_n_k1_grid_desc,
+                make_multi_index(0, n_block_data_idx_on_grid, 0),
+                b_k0_n_k1_block_desc,
+                make_multi_index(0, 0, 0));
+
+        // GEMM definition
+        //   c_mtx += transpose(a_mtx) * b_mtx
+        //     a_mtx[KPerBlock, MPerBlock] is in LDS
+        //     b_mtx[KPerBlock, NPerBlock] is in LDS
+        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
+        //       register
+        // sanity check
+
+        static_assert(MPerBlock % (MPerWave * MRepeat) == 0 &&
+                          NPerBlock % (NPerWave * NRepeat) == 0,
+                      "wrong!");
+
+        constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor(
+            a_k0_m_k1_block_desc,
+            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<MRepeat>{}, Number<MPerBlock / MRepeat>{})),
+                       make_pass_through_transform(K1)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+        constexpr auto b_k0_n0_n1_k1_block_desc = transform_dynamic_tensor_descriptor(
+            b_k0_n_k1_block_desc,
+            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<NRepeat>{}, Number<NPerBlock / NRepeat>{})),
+                       make_pass_through_transform(K1)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+        const auto blockwise_gemm =
+            BlockwiseGemmXdlops_km_kn_m0m1m2n_v1<BlockSize,
+                                                 FloatAB,
+                                                 decltype(a_k0_m0_m1_k1_block_desc),
+                                                 decltype(b_k0_n0_n1_k1_block_desc),
+                                                 MPerWave,
+                                                 NPerWave,
+                                                 K1.value>{};
+
+        constexpr auto CLayout = blockwise_gemm.GetCLayout();
+
+        constexpr index_t BlkSize   = CLayout.GetBlkSize();
+        constexpr index_t NumBlks   = CLayout.GetNumBlks();
+        constexpr index_t NumXdlops = CLayout.GetNumXdlops();
+
+        constexpr auto c_mr_nr_nx_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+            make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, Number<NumXdlops>{}));
+
+        constexpr auto c_blk_nb_bs_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+            make_tuple(Number<NumBlks>{}, Number<BlkSize>{}));
+
+        StaticBuffer<AddressSpace::Vgpr,
+                     vector_type<FloatAcc, c_blk_nb_bs_desc.GetElementSpaceSize()>,
+                     c_mr_nr_nx_desc.GetElementSpaceSize()>
+            c_thread_buf;
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size =
+            math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
+
+        constexpr auto b_block_space_size =
+            math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align);
+
+        FloatAB* p_a_block = p_shared_block;
+        FloatAB* p_b_block = p_shared_block + a_block_space_size;
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
+
+        // hack to control index calculation when iterating over A and B matrix for threadwise copy
+        constexpr auto a_k0_m_k1_grid_iterator_hacks = AGridIteratorHacks{};
+        constexpr auto b_k0_n_k1_grid_iterator_hacks = BGridIteratorHacks{};
+
+        // hack to control index calculation when move slice window for A and B matrix for
+        // threadwise copy
+        constexpr auto a_k0_m_k1_grid_move_slice_window_iterator_hack =
+            AGridMoveSliceWindowIteratorHacks{};
+        constexpr auto b_k0_n_k1_grid_move_slice_window_iterator_hack =
+            BGridMoveSliceWindowIteratorHacks{};
+
+        auto a_block_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
+        auto b_block_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());
+
+        // preload data into LDS
+        {
+            a_blockwise_copy.RunRead(
+                a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_iterator_hacks);
+            b_blockwise_copy.RunRead(
+                b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_iterator_hacks);
+
+            a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf);
+            b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf);
+        }
+
+        // main body: pre-tested so it is skipped when K0 == KPerBlock (only the preloaded tile)
+        index_t k_block_data_begin = 0;
+
+        while(k_block_data_begin < (K0 - KPerBlock))
+        {
+            a_blockwise_copy.MoveSrcSliceWindow(a_k0_m_k1_grid_desc,
+                                                a_block_slice_copy_step,
+                                                a_k0_m_k1_grid_move_slice_window_iterator_hack);
+            b_blockwise_copy.MoveSrcSliceWindow(b_k0_n_k1_grid_desc,
+                                                b_block_slice_copy_step,
+                                                b_k0_n_k1_grid_move_slice_window_iterator_hack);
+
+            a_blockwise_copy.RunRead(
+                a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_iterator_hacks);
+
+            block_sync_lds();
+
+            b_blockwise_copy.RunRead(
+                b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_iterator_hacks);
+
+            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+
+            block_sync_lds();
+
+            a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf);
+            b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf);
+
+            k_block_data_begin += KPerBlock;
+        }
+
+        // tail
+        {
+            block_sync_lds();
+
+            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+        }
+
+        // output: register to global memory
+        {
+            constexpr index_t M0 = CLayout.M1();
+            constexpr index_t M1 = CLayout.N1();
+            constexpr index_t M2 = CLayout.M0();
+
+            constexpr auto c_m0_m1_m2_n_thread_desc =
+                make_dynamic_naive_tensor_descriptor_packed_v2(
+                    make_tuple(Number<M0>{}, Number<1>{}, Number<M2>{}, Number<1>{}));
+
+            StaticBuffer<AddressSpace::Vgpr, FloatC, BlkSize> c_blk_buf_;
+
+            static_for<0, MRepeat, 1>{}([&](auto mr_i) {
+                static_for<0, NRepeat, 1>{}([&](auto nr_i) {
+                    static_for<0, NumXdlops, 1>{}([&](auto xdlops_i) {
+                        static_for<0, NumBlks, 1>{}([&](auto blk_i) {
+                            auto c_blk = c_thread_buf[Number<c_mr_nr_nx_desc.CalculateOffset(
+                                make_tuple(mr_i, nr_i, xdlops_i))>{}];
+
+                            static_for<0, BlkSize, 1>{}([&](auto j) {
+                                c_blk_buf_(j) = c_blk.template AsType<FloatAcc>()[Number<
+                                    c_blk_nb_bs_desc.CalculateOffset(make_tuple(blk_i, j))>{}];
+                            });
+
+                            // calculate origin of thread output tensor on global memory
+                            //     blockwise GEMM c matrix starting index
+                            const auto c_thread_mtx_on_block =
+                                blockwise_gemm.CalculateCThreadOriginDataIndex(
+                                    mr_i, nr_i, xdlops_i, blk_i);
+
+                            const index_t m_thread_data_on_grid =
+                                m_block_data_idx_on_grid + c_thread_mtx_on_block[I0];
+
+                            const index_t n_thread_data_on_grid =
+                                n_block_data_idx_on_grid + c_thread_mtx_on_block[I1];
+
+                            constexpr auto c_m0_m1_m2_n_grid_tensor_iterator_hacks =
+                                CGridIteratorHacks{};
+
+                            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+                                FloatC,
+                                FloatC,
+                                decltype(c_m0_m1_m2_n_thread_desc),
+                                decltype(c_m0_m1_m2_n_grid_desc),
+                                Sequence<M0, 1, M2, 1>,
+                                CThreadTransferSrcDstAccessOrder,
+                                CThreadTransferSrcDstVectorDim,
+                                CThreadTransferDstScalarPerVector,
+                                CGlobalMemoryDataOperation,
+                                1,
+                                true>{c_m0_m1_m2_n_grid_desc,
+                                      make_multi_index(m_thread_data_on_grid / (M2 * M1),
+                                                       m_thread_data_on_grid % (M2 * M1) / M2,
+                                                       m_thread_data_on_grid % M2,
+                                                       n_thread_data_on_grid)}
+                                .Run(c_m0_m1_m2_n_thread_desc,
+                                     make_tuple(I0, I0, I0, I0),
+                                     c_blk_buf_,
+                                     c_m0_m1_m2_n_grid_desc,
+                                     c_grid_buf,
+                                     c_m0_m1_m2_n_grid_tensor_iterator_hacks);
+                        });
+                    });
+                });
+            });
+        }
+    }
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp
@@ -0,0 +1,777 @@
+#ifndef CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2R3_HPP
+#define CK_GRIDWISE_DYNAMIC_GEMM_XDLOPS_V2R3_HPP
+
+#include "common_header.hpp"
+#include "dynamic_multi_index_transform_helper.hpp"
+#include "dynamic_tensor_descriptor.hpp"
+#include "dynamic_tensor_descriptor_helper.hpp"
+#include "blockwise_gemm_xdlops.hpp"
+#include "blockwise_dynamic_tensor_slice_transfer.hpp"
+#include "threadwise_dynamic_tensor_slice_transfer.hpp"
+#include "threadwise_dynamic_tensor_slice_set.hpp"
+
+namespace ck {
+
+// Kernel entry point: declares the workgroup's shared-memory scratch array, sized as
+// GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB) elements, and forwards it with all
+// kernel arguments to GridwiseGemm::Run. Descriptors and the adaptor are passed by value.
+template <typename GridwiseGemm,
+          typename FloatAB,
+          typename FloatC,
+          typename AK0MK1GridDesc,
+          typename BK0NK1GridDesc,
+          typename CM0M1M2NGridDesc,
+          typename CBlockClusterAdaptor>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_dynamic_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid,
+                                        const FloatAB* __restrict__ p_b_grid,
+                                        FloatC* __restrict__ p_c_grid,
+                                        const AK0MK1GridDesc a_k0_m_k1_grid_desc,
+                                        const BK0NK1GridDesc b_k0_n_k1_grid_desc,
+                                        const CM0M1M2NGridDesc c_m0_m1_m2_n_grid_desc,
+                                        const CBlockClusterAdaptor c_block_cluster_adaptor)
+{
+    constexpr index_t lds_element_count =
+        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
+
+    __shared__ FloatAB lds_workspace[lds_element_count];
+
+    GridwiseGemm::Run(p_a_grid, p_b_grid, p_c_grid,
+                      lds_workspace,
+                      a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc,
+                      c_m0_m1_m2_n_grid_desc,
+                      c_block_cluster_adaptor);
+}
+
+template <index_t BlockSize,
+          typename FloatAB,
+          typename FloatAcc,
+          typename FloatC,
+          InMemoryDataOperation CGlobalMemoryDataOperation,
+          typename AK0MK1GridDesc,
+          typename BK0NK1GridDesc,
+          typename CMNGridDesc,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerWave,
+          index_t NPerWave,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadSliceLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_K1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          typename BBlockTransferThreadSliceLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_K1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          typename CThreadTransferSrcDstAccessOrder,
+          index_t CThreadTransferSrcDstVectorDim,
+          index_t CThreadTransferDstScalarPerVector,
+          typename AGridIteratorHacks,
+          typename BGridIteratorHacks,
+          typename CGridIteratorHacks,
+          typename AGridMoveSliceWindowIteratorHacks,
+          typename BGridMoveSliceWindowIteratorHacks,
+          bool CAccessOrderMRepeatNRepeat>
+struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
+{
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    // K1 should be Number<...>
+    static constexpr auto K1 = AK0MK1GridDesc{}.GetLength(I2);
+
+    // Bytes of LDS needed for one A tile plus one B tile of FloatAB elements.
+    // Each tile is described by an aligned [KPerBlock, MPerBlock or NPerBlock, K1] descriptor,
+    // and each tile's element-space size is rounded up to a multiple of K1 before the two are
+    // summed and scaled by sizeof(FloatAB).
+    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        constexpr auto lds_align = K1;
+
+        // A tile: descriptor, then padded element count
+        constexpr auto a_tile_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), lds_align);
+        constexpr auto a_tile_elems =
+            math::integer_least_multiple(a_tile_desc.GetElementSpaceSize(), lds_align);
+
+        // B tile: descriptor, then padded element count
+        constexpr auto b_tile_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), lds_align);
+        constexpr auto b_tile_elems =
+            math::integer_least_multiple(b_tile_desc.GetElementSpaceSize(), lds_align);
+
+        // element count -> byte count
+        return (a_tile_elems + b_tile_elems) * sizeof(FloatAB);
+    }
+
+    // Returns true when the three grid descriptors agree with each other and with the tile sizes:
+    // matching M, N, K0 and K1 extents, M/N/K0 divisible by the per-block tile sizes, and the
+    // per-block M/N tile sizes divisible by the per-wave sizes. K1 must be a compile-time value.
+    __host__ __device__ static constexpr bool
+    CheckValidity(const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
+                  const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
+                  const CMNGridDesc& c_m_n_grid_desc)
+    {
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
+                      "wrong! K1 need to be known at compile-time");
+
+        const auto m_len  = a_k0_m_k1_grid_desc.GetLength(I1);
+        const auto n_len  = b_k0_n_k1_grid_desc.GetLength(I1);
+        const auto k0_len = a_k0_m_k1_grid_desc.GetLength(I0);
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        return m_len == c_m_n_grid_desc.GetLength(I0) && n_len == c_m_n_grid_desc.GetLength(I1) &&
+               k0_len == b_k0_n_k1_grid_desc.GetLength(I0) &&
+               K1 == a_k0_m_k1_grid_desc.GetLength(I2) && K1 == b_k0_n_k1_grid_desc.GetLength(I2) &&
+               m_len % MPerBlock == 0 && n_len % NPerBlock == 0 && k0_len % KPerBlock == 0 &&
+               MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0;
+    }
+
+    // Grid size: the count of MPerBlock x NPerBlock tiles in the M x N extent of C.
+    // Both quotients use integer division, so M and N should be multiples of the tile sizes.
+    __host__ __device__ static constexpr index_t
+    CalculateGridSize(const CMNGridDesc& c_m_n_grid_desc)
+    {
+        const index_t m_tile_count = c_m_n_grid_desc.GetLength(I0) / MPerBlock;
+        const index_t n_tile_count = c_m_n_grid_desc.GetLength(I1) / NPerBlock;
+
+        return m_tile_count * n_tile_count;
+    }
+
+    // Transforms the 2-d [M, N] C grid descriptor into an 8-d one (despite the "m0_m1_m2_n" name):
+    // M -> (MRepeat, MWaves, M0, M1, M2) on dims 0,2,4,5,6 and N -> (NRepeat, NWaves, N1) on 1,3,7.
+    // NOTE(review): all unmerge lengths are compile-time tile factors; the runtime M/N extents are
+    // not consulted here -- confirm against the indexing done by the consumer of this descriptor.
+    __host__ __device__ static constexpr auto
+    MakeCM0M1M2NGridDescriptor(const CMNGridDesc& c_m_n_grid_desc)
+    {
+        constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerWave, NPerWave, K1.value>{};
+
+        constexpr auto CLayout = xdlops_gemm.GetCLayout();
+
+        constexpr auto M0 = Number<CLayout.M1()>{};
+        constexpr auto M1 = Number<CLayout.N1()>{};
+        constexpr auto M2 = Number<CLayout.M0()>{};
+
+        constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat);
+        constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat);
+
+        constexpr auto N1 = Number<CLayout.N0()>{};
+
+        const auto c_m0_m1_m2_n_grid_desc = transform_dynamic_tensor_descriptor(
+            c_m_n_grid_desc,
+            make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, M0, M1, M2)),
+                       make_unmerge_transform(make_tuple(NRepeat, NWaves, N1))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 2, 4, 5, 6>{}, Sequence<1, 3, 7>{}));
+
+        return c_m0_m1_m2_n_grid_desc;
+    }
+
+    // Builds the adaptor relating a 1-d block id to a 2-d (m-block, n-block) index.
+    // A single merge transform combines (M / MPerBlock, N / NPerBlock) into one dimension;
+    // Run() calls CalculateBottomIndex() on the result to recover the 2-d block index.
+    __host__ __device__ static constexpr auto
+    MakeCBlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc)
+    {
+        // number of block tiles along M and along N
+        const auto num_m_blocks = c_m_n_grid_desc.GetLength(I0) / Number<MPerBlock>{};
+        const auto num_n_blocks = c_m_n_grid_desc.GetLength(I1) / Number<NPerBlock>{};
+
+#if 1
+        // active variant: merged as (m-block, n-block)
+        return make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(num_m_blocks, num_n_blocks))),
+            make_tuple(Sequence<0, 1>{}),
+            make_tuple(Sequence<0>{}));
+#elif 1
+        // disabled variant: merged as (n-block, m-block)
+        return make_single_stage_tensor_adaptor(
+            make_tuple(make_merge_transform(make_tuple(num_n_blocks, num_m_blocks))),
+            make_tuple(Sequence<1, 0>{}),
+            make_tuple(Sequence<0>{}));
+#endif
+    }
+
+    // Types produced by the two factory functions above for a default-constructed CMNGridDesc;
+    // they are the parameter types Run() expects for its C descriptor and block-cluster adaptor.
+    using CM0M1M2NGridDesc     = decltype(MakeCM0M1M2NGridDescriptor(CMNGridDesc{}));
+    using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}));
+
+    // Computes one MPerBlock x NPerBlock tile of C per block: c_mtx += transpose(a_mtx) * b_mtx.
+    //
+    // The K0 dimension is walked in steps of KPerBlock. Each step copies an A tile and a B tile
+    // from global memory into LDS with the blockwise copies, then accumulates their product
+    // into per-thread registers with the blockwise xdlops GEMM. After the last step the
+    // accumulators are written to the C grid, one (MRepeat, NRepeat) block at a time.
+    //
+    //   p_a_grid, p_b_grid:      global-memory inputs described by the A/B grid descriptors
+    //   p_c_grid:                global-memory output described by c_m0_m1_m2_n_grid_desc
+    //   p_shared_block:          LDS scratch; the A tile starts at offset 0 and the B tile at
+    //                            offset a_block_space_size, so it must hold at least
+    //                            a_block_space_size + b_block_space_size FloatAB elements
+    //   a_k0_m_k1_grid_desc:     A descriptor, lengths [K0, M, K1]
+    //   b_k0_n_k1_grid_desc:     B descriptor, lengths [K0, N, K1]
+    //   c_m0_m1_m2_n_grid_desc:  8-d C descriptor (see MakeCM0M1M2NGridDescriptor)
+    //   c_block_cluster_adaptor: maps the 1-d block id to the (m-block, n-block) index
+    //
+    // Only the (MRepeat, NRepeat) combinations listed in the static_assert near the end are
+    // supported by the output stage.
+    __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
+                               const FloatAB* __restrict__ p_b_grid,
+                               FloatC* __restrict__ p_c_grid,
+                               FloatAB* __restrict__ p_shared_block,
+                               const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
+                               const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
+                               const CM0M1M2NGridDesc& c_m0_m1_m2_n_grid_desc,
+                               const CBlockClusterAdaptor& c_block_cluster_adaptor)
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
+        const auto a_grid_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_a_grid, a_k0_m_k1_grid_desc.GetElementSpaceSize());
+        const auto b_grid_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_b_grid, b_k0_n_k1_grid_desc.GetElementSpaceSize());
+        auto c_grid_buf = make_dynamic_buffer<AddressSpace::Global>(
+            p_c_grid, c_m0_m1_m2_n_grid_desc.GetElementSpaceSize());
+
+        // total length of the reduction dimension; bounds the main K loop below
+        const auto K0 = a_k0_m_k1_grid_desc.GetLength(I0);
+
+        // divide block work by [M, N]
+        const auto block_work_idx =
+            c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
+
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = K1;
+
+        // A matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto a_k0_m_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
+
+        // B matrix in LDS memory, dst of blockwise copy
+        //   be careful of LDS alignment
+        constexpr auto b_k0_n_k1_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
+            make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
+
+        // A matrix blockwise copy
+        //   source window starts at [0, m_block_data_idx_on_grid, 0] on the grid,
+        //   destination is the whole A tile in LDS
+        auto a_blockwise_copy =
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+                                                   InMemoryDataOperation::Set,
+                                                   Sequence<KPerBlock, MPerBlock, K1.value>,
+                                                   ABlockTransferThreadSliceLengths_K0_M_K1,
+                                                   ABlockTransferThreadClusterLengths_K0_M_K1,
+                                                   ABlockTransferThreadClusterArrangeOrder,
+                                                   FloatAB,
+                                                   FloatAB,
+                                                   decltype(a_k0_m_k1_grid_desc),
+                                                   decltype(a_k0_m_k1_block_desc),
+                                                   ABlockTransferSrcAccessOrder,
+                                                   Sequence<1, 0, 2>,
+                                                   ABlockTransferSrcVectorDim,
+                                                   2,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   ABlockTransferDstScalarPerVector_K1,
+                                                   1,
+                                                   1,
+                                                   AThreadTransferSrcResetCoordinateAfterRun,
+                                                   true>(
+                a_k0_m_k1_grid_desc,
+                make_multi_index(0, m_block_data_idx_on_grid, 0),
+                a_k0_m_k1_block_desc,
+                make_multi_index(0, 0, 0));
+
+        // B matrix blockwise copy
+        //   source window starts at [0, n_block_data_idx_on_grid, 0] on the grid,
+        //   destination is the whole B tile in LDS
+        auto b_blockwise_copy =
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
+                                                   InMemoryDataOperation::Set,
+                                                   Sequence<KPerBlock, NPerBlock, K1.value>,
+                                                   BBlockTransferThreadSliceLengths_K0_N_K1,
+                                                   BBlockTransferThreadClusterLengths_K0_N_K1,
+                                                   BBlockTransferThreadClusterArrangeOrder,
+                                                   FloatAB,
+                                                   FloatAB,
+                                                   decltype(b_k0_n_k1_grid_desc),
+                                                   decltype(b_k0_n_k1_block_desc),
+                                                   BBlockTransferSrcAccessOrder,
+                                                   Sequence<1, 0, 2>,
+                                                   BBlockTransferSrcVectorDim,
+                                                   2,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   BBlockTransferDstScalarPerVector_K1,
+                                                   1,
+                                                   1,
+                                                   BThreadTransferSrcResetCoordinateAfterRun,
+                                                   true>(
+                b_k0_n_k1_grid_desc,
+                make_multi_index(0, n_block_data_idx_on_grid, 0),
+                b_k0_n_k1_block_desc,
+                make_multi_index(0, 0, 0));
+
+        // GEMM definition
+        //   c_mtx += transpose(a_mtx) * b_mtx
+        //     a_mtx[KPerBlock, MPerBlock] is in LDS
+        //     b_mtx[KPerBlock, NPerBlock] is in LDS
+        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
+        //       register
+        // sanity check
+
+        static_assert(MPerBlock % (MPerWave * MRepeat) == 0 &&
+                          NPerBlock % (NPerWave * NRepeat) == 0,
+                      "wrong!");
+
+        // 4-d views of the LDS tiles with M (resp. N) split into [repeat, per-repeat length]
+        constexpr auto a_k0_m0_m1_k1_block_desc = transform_dynamic_tensor_descriptor(
+            a_k0_m_k1_block_desc,
+            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<MRepeat>{}, Number<MPerBlock / MRepeat>{})),
+                       make_pass_through_transform(K1)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+        constexpr auto b_k0_n0_n1_k1_block_desc = transform_dynamic_tensor_descriptor(
+            b_k0_n_k1_block_desc,
+            make_tuple(make_pass_through_transform(Number<KPerBlock>{}),
+                       make_unmerge_transform(
+                           make_tuple(Number<NRepeat>{}, Number<NPerBlock / NRepeat>{})),
+                       make_pass_through_transform(K1)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
+
+        const auto blockwise_gemm =
+            BlockwiseGemmXdlops_km_kn_m0m1m2n_v1<BlockSize,
+                                                 FloatAB,
+                                                 decltype(a_k0_m0_m1_k1_block_desc),
+                                                 decltype(b_k0_n0_n1_k1_block_desc),
+                                                 MPerWave,
+                                                 NPerWave,
+                                                 K1.value>{};
+
+        constexpr auto CLayout = blockwise_gemm.GetCLayout();
+
+        constexpr index_t BlkSize   = CLayout.GetBlkSize();
+        constexpr index_t NumBlks   = CLayout.GetNumBlks();
+        constexpr index_t NumXdlops = CLayout.GetNumXdlops();
+
+        static_assert(NumBlks == 1 && NumXdlops == 1, "K Reduction Mfma only");
+
+        // one accumulator vector of BlkSize elements per (MRepeat, NRepeat) pair
+        constexpr auto c_mr_nr_blk_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+            make_tuple(Number<MRepeat>{}, Number<NRepeat>{}));
+
+        StaticBuffer<AddressSpace::Vgpr,
+                     vector_type<FloatAcc, BlkSize>,
+                     c_mr_nr_blk_desc.GetElementSpaceSize()>
+            c_thread_buf;
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size =
+            math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align);
+
+        constexpr auto b_block_space_size =
+            math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align);
+
+        FloatAB* p_a_block = p_shared_block;
+        FloatAB* p_b_block = p_shared_block + a_block_space_size;
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0);
+
+        // hack to control index calculation when iterating over A and B matrix for threadwise copy
+        constexpr auto a_k0_m_k1_grid_iterator_hacks = AGridIteratorHacks{};
+        constexpr auto b_k0_n_k1_grid_iterator_hacks = BGridIteratorHacks{};
+
+        // hack to control index calculation when move slice window for A and B matrix for
+        // threadwise copy
+        constexpr auto a_k0_m_k1_grid_move_slice_window_iterator_hack =
+            AGridMoveSliceWindowIteratorHacks{};
+        constexpr auto b_k0_n_k1_grid_move_slice_window_iterator_hack =
+            BGridMoveSliceWindowIteratorHacks{};
+
+        auto a_block_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize());
+        auto b_block_buf = make_dynamic_buffer<AddressSpace::Lds>(
+            p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize());
+
+        // preload data into LDS
+        {
+            a_blockwise_copy.RunRead(
+                a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_iterator_hacks);
+            b_blockwise_copy.RunRead(
+                b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_iterator_hacks);
+
+            a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf);
+            b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf);
+        }
+
+        // main body
+        index_t k_block_data_begin = 0;
+
+        // Each iteration fetches the NEXT K0 tile while the GEMM consumes the tile already in
+        // LDS, so it must run exactly (K0 / KPerBlock - 1) times. The condition is tested
+        // before the first iteration: when K0 == KPerBlock there is no next tile, and running
+        // the body once would move the source windows past the end of K0 and make the tail
+        // GEMM accumulate a tile that does not belong to the problem.
+        while(k_block_data_begin < (K0 - KPerBlock))
+        {
+            a_blockwise_copy.MoveSrcSliceWindow(a_k0_m_k1_grid_desc,
+                                                a_block_slice_copy_step,
+                                                a_k0_m_k1_grid_move_slice_window_iterator_hack);
+            b_blockwise_copy.MoveSrcSliceWindow(b_k0_n_k1_grid_desc,
+                                                b_block_slice_copy_step,
+                                                b_k0_n_k1_grid_move_slice_window_iterator_hack);
+
+            a_blockwise_copy.RunRead(
+                a_k0_m_k1_grid_desc, a_grid_buf, a_k0_m_k1_grid_iterator_hacks);
+
+            // block_sync_lds() is defined elsewhere; presumably a block-wide LDS barrier so
+            // the tile stored by the previous RunWrite is complete before the GEMM reads it
+            block_sync_lds();
+
+            b_blockwise_copy.RunRead(
+                b_k0_n_k1_grid_desc, b_grid_buf, b_k0_n_k1_grid_iterator_hacks);
+
+            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+
+            // the GEMM must be done with the current LDS tile before it is overwritten
+            block_sync_lds();
+
+            a_blockwise_copy.RunWrite(a_k0_m_k1_block_desc, a_block_buf);
+            b_blockwise_copy.RunWrite(b_k0_n_k1_block_desc, b_block_buf);
+
+            k_block_data_begin += KPerBlock;
+        }
+
+        // tail: GEMM on the last tile, which is already in LDS
+        {
+            block_sync_lds();
+
+            blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf);
+        }
+
+#if 0
+        // output: register to global memory
+        {
+            constexpr index_t M0 = CLayout.M1();
+            constexpr index_t M1 = CLayout.N1();
+            constexpr index_t M2 = CLayout.M0();
+
+            constexpr index_t N0 = CLayout.N1();
+            constexpr index_t N1 = CLayout.N0();
+
+            constexpr auto c_m0_m1_m2_n_thread_desc =
+                make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(Number<MRepeat>{},
+                                                                          Number<NRepeat>{},
+                                                                          Number<1>{},
+                                                                          Number<1>{},
+                                                                          Number<M0>{},
+                                                                          Number<1>{},
+                                                                          Number<M2>{},
+                                                                          Number<1>{}));
+
+            StaticBuffer<AddressSpace::Vgpr, FloatC, c_m0_m1_m2_n_thread_desc.GetElementSpaceSize()>
+                c_blk_buf_;
+
+            static_for<0, MRepeat, 1>{}([&](auto mr_i) {
+                static_for<0, NRepeat, 1>{}([&](auto nr_i) {
+                    constexpr auto blk_off =
+                        c_mr_nr_blk_desc.CalculateOffset(make_tuple(mr_i, nr_i));
+
+                    static_for<0, BlkSize, 1>{}([&](auto j) {
+                        c_blk_buf_(Number<blk_off * BlkSize + j>{}) =
+                            c_thread_buf[Number<blk_off>{}]
+                                .template AsType<FloatAcc>()[Number<j>{}];
+                    });
+                });
+            });
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_grid =
+                m_block_data_idx_on_grid + c_thread_mtx_on_block[I0];
+
+            const index_t n_thread_data_on_grid =
+                n_block_data_idx_on_grid + c_thread_mtx_on_block[I1];
+
+            constexpr auto c_m0_m1_m2_n_grid_tensor_iterator_hacks = CGridIteratorHacks{};
+
+            constexpr index_t MWaves = MPerBlock / (MPerWave * MRepeat);
+            constexpr index_t NWaves = NPerBlock / (NPerWave * NRepeat);
+
+            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+                FloatC,
+                FloatC,
+                decltype(c_m0_m1_m2_n_thread_desc),
+                decltype(c_m0_m1_m2_n_grid_desc),
+                Sequence<MRepeat, NRepeat, 1, 1, M0, 1, M2, 1>,
+                CThreadTransferSrcDstAccessOrder,
+                CThreadTransferSrcDstVectorDim,
+                CThreadTransferDstScalarPerVector,
+                CGlobalMemoryDataOperation,
+                1,
+                true>{
+                c_m0_m1_m2_n_grid_desc,
+                make_multi_index(m_thread_data_on_grid / (M2 * M1 * M0 * MWaves),
+                                 n_thread_data_on_grid / (N1 * NWaves),
+                                 m_thread_data_on_grid % (M2 * M1 * M0 * MWaves) / (M2 * M1 * M0),
+                                 n_thread_data_on_grid % (N1 * NWaves) / N1,
+                                 m_thread_data_on_grid % (M2 * M1 * M0) / (M2 * M1),
+                                 m_thread_data_on_grid % (M2 * M1) / M2,
+                                 m_thread_data_on_grid % M2,
+                                 n_thread_data_on_grid % N1)}
+                .Run(c_m0_m1_m2_n_thread_desc,
+                     make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
+                     c_blk_buf_,
+                     c_m0_m1_m2_n_grid_desc,
+                     c_grid_buf,
+                     c_m0_m1_m2_n_grid_tensor_iterator_hacks);
+        }
+#else
+        // output: register to global memory, one (MRepeat, NRepeat) accumulator block per copy.
+        // A single threadwise copy object is reused; between copies its destination window is
+        // moved by +/-1 along dimension 0 (MRepeat) or dimension 1 (NRepeat).
+        {
+            constexpr index_t M0 = CLayout.M1();
+            constexpr index_t M1 = CLayout.N1();
+            constexpr index_t M2 = CLayout.M0();
+
+            constexpr auto c_m0_m1_m2_n_thread_desc =
+                make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(
+                    I1, I1, I1, I1, Number<M0>{}, Number<1>{}, Number<M2>{}, Number<1>{}));
+
+            StaticBuffer<AddressSpace::Vgpr, FloatC, BlkSize> c_blk_buf_;
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_grid =
+                m_block_data_idx_on_grid + c_thread_mtx_on_block[I0];
+
+            const index_t n_thread_data_on_grid =
+                n_block_data_idx_on_grid + c_thread_mtx_on_block[I1];
+
+            constexpr auto c_m0_m1_m2_n_grid_tensor_iterator_hacks = CGridIteratorHacks{};
+
+            // destination origin: the repeat/wave dimensions start at 0 and the whole thread
+            // offset is carried by the M0/M1/M2/N1 dimensions
+            auto c_thread_copy =
+                ThreadwiseDynamicTensorSliceTransfer_v1r3<FloatC,
+                                                          FloatC,
+                                                          decltype(c_m0_m1_m2_n_thread_desc),
+                                                          decltype(c_m0_m1_m2_n_grid_desc),
+                                                          Sequence<1, 1, 1, 1, M0, 1, M2, 1>,
+                                                          CThreadTransferSrcDstAccessOrder,
+                                                          CThreadTransferSrcDstVectorDim,
+                                                          CThreadTransferDstScalarPerVector,
+                                                          CGlobalMemoryDataOperation,
+                                                          1,
+                                                          true>{
+                    c_m0_m1_m2_n_grid_desc,
+                    make_multi_index(0,
+                                     0,
+                                     0,
+                                     0,
+                                     m_thread_data_on_grid / (M2 * M1),
+                                     m_thread_data_on_grid % (M2 * M1) / M2,
+                                     m_thread_data_on_grid % M2,
+                                     n_thread_data_on_grid)};
+
+            // write accumulator block c_thread_idx_ at the current destination window
+            auto init_copy = [&](auto c_thread_idx_) {
+                constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
+                c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
+                                  make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
+                                  c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
+                                  c_m0_m1_m2_n_grid_desc,
+                                  c_grid_buf,
+                                  c_m0_m1_m2_n_grid_tensor_iterator_hacks);
+
+                return c_thread_idx_;
+            };
+
+            // move the destination window by +1 along MRepeat, then write block c_thread_idx_
+            auto mrepeat_plus_copy = [&](auto c_thread_idx_) {
+                constexpr auto mrepeat_step_plus = make_multi_index(1, 0, 0, 0, 0, 0, 0, 0);
+                c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, mrepeat_step_plus);
+
+                constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
+                c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
+                                  make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
+                                  c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
+                                  c_m0_m1_m2_n_grid_desc,
+                                  c_grid_buf,
+                                  c_m0_m1_m2_n_grid_tensor_iterator_hacks);
+            };
+
+            // move the destination window by +1 along NRepeat, then write block c_thread_idx_
+            auto nrepeat_plus_copy = [&](auto c_thread_idx_) {
+                constexpr auto nrepeat_step_plus = make_multi_index(0, 1, 0, 0, 0, 0, 0, 0);
+                c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, nrepeat_step_plus);
+
+                constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
+                c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
+                                  make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
+                                  c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
+                                  c_m0_m1_m2_n_grid_desc,
+                                  c_grid_buf,
+                                  c_m0_m1_m2_n_grid_tensor_iterator_hacks);
+            };
+
+            // move the destination window by -1 along MRepeat, then write block c_thread_idx_
+            auto mrepeat_minus_copy = [&](auto c_thread_idx_) {
+                constexpr auto mrepeat_step_minus = make_multi_index(-1, 0, 0, 0, 0, 0, 0, 0);
+                c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, mrepeat_step_minus);
+
+                constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
+                c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
+                                  make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
+                                  c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
+                                  c_m0_m1_m2_n_grid_desc,
+                                  c_grid_buf,
+                                  c_m0_m1_m2_n_grid_tensor_iterator_hacks);
+            };
+
+            // move the destination window by -1 along NRepeat, then write block c_thread_idx_
+            auto nrepeat_minus_copy = [&](auto c_thread_idx_) {
+                constexpr auto nrepeat_step_minus = make_multi_index(0, -1, 0, 0, 0, 0, 0, 0);
+                c_thread_copy.MoveDstSliceWindow(c_m0_m1_m2_n_grid_desc, nrepeat_step_minus);
+
+                constexpr auto blk_off = c_mr_nr_blk_desc.CalculateOffset(c_thread_idx_);
+                c_thread_copy.Run(c_m0_m1_m2_n_thread_desc,
+                                  make_tuple(I0, I0, I0, I0, I0, I0, I0, I0),
+                                  c_thread_buf[Number<blk_off>{}].template AsType<FloatAcc>(),
+                                  c_m0_m1_m2_n_grid_desc,
+                                  c_grid_buf,
+                                  c_m0_m1_m2_n_grid_tensor_iterator_hacks);
+            };
+
+            static_assert((MRepeat == 4 && NRepeat == 4) or (MRepeat == 4 && NRepeat == 2) or
+                              (MRepeat == 2 && NRepeat == 4) or (MRepeat == 2 && NRepeat == 2) or
+                              (MRepeat == 2 && NRepeat == 1) or (MRepeat == 1 && NRepeat == 2) or
+                              (MRepeat == 1 && NRepeat == 1),
+                          "wrong");
+
+            // Each sequence below visits every (m-repeat, n-repeat) pair exactly once, and the
+            // tuple passed to a helper is the pair reached after that helper's window move.
+            // CAccessOrderMRepeatNRepeat selects whether NRepeat (true) or MRepeat (false) is
+            // the fast-changing index of the walk.
+            if constexpr(MRepeat == 4 && NRepeat == 4)
+            {
+                init_copy(make_tuple(I0, I0));
+
+                if constexpr(CAccessOrderMRepeatNRepeat)
+                {
+                    nrepeat_plus_copy(make_tuple(I0, I1));
+                    nrepeat_plus_copy(make_tuple(I0, I2));
+                    nrepeat_plus_copy(make_tuple(I0, I3));
+                    mrepeat_plus_copy(make_tuple(I1, I3));
+                    nrepeat_minus_copy(make_tuple(I1, I2));
+                    nrepeat_minus_copy(make_tuple(I1, I1));
+                    nrepeat_minus_copy(make_tuple(I1, I0));
+                    mrepeat_plus_copy(make_tuple(I2, I0));
+                    nrepeat_plus_copy(make_tuple(I2, I1));
+                    nrepeat_plus_copy(make_tuple(I2, I2));
+                    nrepeat_plus_copy(make_tuple(I2, I3));
+                    mrepeat_plus_copy(make_tuple(I3, I3));
+                    nrepeat_minus_copy(make_tuple(I3, I2));
+                    nrepeat_minus_copy(make_tuple(I3, I1));
+                    nrepeat_minus_copy(make_tuple(I3, I0));
+                }
+                else
+                {
+                    mrepeat_plus_copy(make_tuple(I1, I0));
+                    mrepeat_plus_copy(make_tuple(I2, I0));
+                    mrepeat_plus_copy(make_tuple(I3, I0));
+                    nrepeat_plus_copy(make_tuple(I3, I1));
+                    mrepeat_minus_copy(make_tuple(I2, I1));
+                    mrepeat_minus_copy(make_tuple(I1, I1));
+                    mrepeat_minus_copy(make_tuple(I0, I1));
+                    nrepeat_plus_copy(make_tuple(I0, I2));
+                    mrepeat_plus_copy(make_tuple(I1, I2));
+                    mrepeat_plus_copy(make_tuple(I2, I2));
+                    mrepeat_plus_copy(make_tuple(I3, I2));
+                    nrepeat_plus_copy(make_tuple(I3, I3));
+                    mrepeat_minus_copy(make_tuple(I2, I3));
+                    mrepeat_minus_copy(make_tuple(I1, I3));
+                    mrepeat_minus_copy(make_tuple(I0, I3));
+                }
+            }
+            else if constexpr(MRepeat == 4 && NRepeat == 2)
+            {
+                init_copy(make_tuple(I0, I0));
+
+                if constexpr(CAccessOrderMRepeatNRepeat)
+                {
+                    nrepeat_plus_copy(make_tuple(I0, I1));
+                    mrepeat_plus_copy(make_tuple(I1, I1));
+                    nrepeat_minus_copy(make_tuple(I1, I0));
+                    mrepeat_plus_copy(make_tuple(I2, I0));
+                    nrepeat_plus_copy(make_tuple(I2, I1));
+                    mrepeat_plus_copy(make_tuple(I3, I1));
+                    nrepeat_minus_copy(make_tuple(I3, I0));
+                }
+                else
+                {
+                    mrepeat_plus_copy(make_tuple(I1, I0));
+                    mrepeat_plus_copy(make_tuple(I2, I0));
+                    mrepeat_plus_copy(make_tuple(I3, I0));
+                    nrepeat_plus_copy(make_tuple(I3, I1));
+                    mrepeat_minus_copy(make_tuple(I2, I1));
+                    mrepeat_minus_copy(make_tuple(I1, I1));
+                    mrepeat_minus_copy(make_tuple(I0, I1));
+                }
+            }
+            else if constexpr(MRepeat == 2 && NRepeat == 4)
+            {
+                init_copy(make_tuple(I0, I0));
+
+                if constexpr(CAccessOrderMRepeatNRepeat)
+                {
+                    nrepeat_plus_copy(make_tuple(I0, I1));
+                    nrepeat_plus_copy(make_tuple(I0, I2));
+                    nrepeat_plus_copy(make_tuple(I0, I3));
+                    mrepeat_plus_copy(make_tuple(I1, I3));
+                    nrepeat_minus_copy(make_tuple(I1, I2));
+                    nrepeat_minus_copy(make_tuple(I1, I1));
+                    nrepeat_minus_copy(make_tuple(I1, I0));
+                }
+                else
+                {
+                    mrepeat_plus_copy(make_tuple(I1, I0));
+                    nrepeat_plus_copy(make_tuple(I1, I1));
+                    mrepeat_minus_copy(make_tuple(I0, I1));
+                    nrepeat_plus_copy(make_tuple(I0, I2));
+                    mrepeat_plus_copy(make_tuple(I1, I2));
+                    nrepeat_plus_copy(make_tuple(I1, I3));
+                    mrepeat_minus_copy(make_tuple(I0, I3));
+                }
+            }
+            else if constexpr(MRepeat == 2 && NRepeat == 2)
+            {
+                init_copy(make_tuple(I0, I0));
+
+                if constexpr(CAccessOrderMRepeatNRepeat)
+                {
+                    nrepeat_plus_copy(make_tuple(I0, I1));
+                    mrepeat_plus_copy(make_tuple(I1, I1));
+                    nrepeat_minus_copy(make_tuple(I1, I0));
+                }
+                else
+                {
+                    mrepeat_plus_copy(make_tuple(I1, I0));
+                    nrepeat_plus_copy(make_tuple(I1, I1));
+                    mrepeat_minus_copy(make_tuple(I0, I1));
+                }
+            }
+            else if constexpr(MRepeat == 2 && NRepeat == 1)
+            {
+                init_copy(make_tuple(I0, I0));
+                mrepeat_plus_copy(make_tuple(I1, I0));
+            }
+            else if constexpr(MRepeat == 1 && NRepeat == 2)
+            {
+                init_copy(make_tuple(I0, I0));
+                nrepeat_plus_copy(make_tuple(I0, I1));
+            }
+            else if constexpr(MRepeat == 1 && NRepeat == 1)
+            {
+                init_copy(make_tuple(I0, I0));
+            }
+        }
+#endif
+    }
+}; // namespace ck
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
@@ -101,9 +101,9 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3

        static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");

-        static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
-                              remove_cv_t<remove_reference_t<SrcData>>>::value,
-                      "wrong! SrcBuffer data type is wrong");
+        // static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
+        // remove_cv_t<remove_reference_t<SrcData>>>::value,
+        //"wrong! SrcBuffer data type is wrong");

        // SrcDesc and src_slice_origin_idx are known at compile-time
        constexpr auto src_desc             = remove_cv_t<remove_reference_t<SrcDesc>>{};
@@ -1407,7 +1407,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v4
            constexpr auto data_to_origin_disp_idx =
                ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
 #endif
-
            // src coordinate
            constexpr auto src_ref_to_data_disp_idx =
                src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
--- a/composable_kernel/include/tensor_operation/xdlops_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/xdlops_gemm.hpp
@@ -0,0 +1,802 @@
+#ifndef CK_XDLOPS_GEMM_HPP
+#define CK_XDLOPS_GEMM_HPP
+
+#include "common_header.hpp"
+#include "ConstantMatrixDescriptor.hpp"
+#include "math.hpp"
+#include "amd_xdlops.hpp"
+
+namespace ck {
+
+// Tags for the MFMA (xdlops) instruction variants this header knows about.
+// Enumerators are numbered consecutively from 0 in declaration order.
+enum struct mfma_instr
+{
+    // fp32
+    mfma_f32_32x32x1xf32,
+    mfma_f32_16x16x1xf32,
+    mfma_f32_4x4x1xf32,
+    mfma_f32_32x32x2xf32, // k reduction
+    mfma_f32_16x16x4xf32, // k reduction
+
+    // fp16
+    mfma_f32_32x32x4f16,
+    mfma_f32_16x16x4f16,
+    mfma_f32_4x4x4f16,
+    mfma_f32_32x32x8f16,  // k reduction
+    mfma_f32_16x16x16f16, // k reduction
+
+    // bfp16
+    mfma_f32_32x32x2bf16,
+    mfma_f32_16x16x2bf16,
+    mfma_f32_4x4x2bf16,
+    mfma_f32_32x32x4bf16, // k reduction
+    mfma_f32_16x16x8bf16, // k reduction
+};
+
+template <mfma_instr instr>
+struct mfma_info;
+
+// Compile-time description of mfma_instr::mfma_f32_32x32x1xf32.
+// xdlops_info and XdlopsGemm read these constants; XdlopsGemm's constructor
+// cross-checks them against each other with static_asserts.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_32x32x1xf32>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 32;
+    static constexpr index_t n      = 32;
+    static constexpr index_t k      = 1;
+    static constexpr index_t k_base = 1;
+    static constexpr index_t cycles = 64;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 32;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 2;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 4;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+
+    // Forwards the operands to intrin_mfma_f32_32x32x1f32<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_32x32x1f32<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+// Compile-time description of mfma_instr::mfma_f32_32x32x2xf32 (marked
+// "k reduction" in mfma_instr). With one output block and two input blocks,
+// xdlops_info::IsKReduction() evaluates to true for this entry.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_32x32x2xf32>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 32;
+    static constexpr index_t n      = 32;
+    static constexpr index_t k      = 2;
+    static constexpr index_t k_base = 1;
+    static constexpr index_t cycles = 64;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 32;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 1;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 4;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+
+    // Forwards the operands to intrin_mfma_f32_32x32x2f32<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_32x32x2f32<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+// Compile-time description of mfma_instr::mfma_f32_16x16x4xf32 (marked
+// "k reduction" in mfma_instr). With one output block and four input blocks,
+// xdlops_info::IsKReduction() evaluates to true for this entry.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_16x16x4xf32>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 16;
+    static constexpr index_t n      = 16;
+    static constexpr index_t k      = 4;
+    static constexpr index_t k_base = 1;
+    static constexpr index_t cycles = 32;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 16;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 1;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 1;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+
+    // Forwards the operands to intrin_mfma_f32_16x16x4f32<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_16x16x4f32<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+// Compile-time description of mfma_instr::mfma_f32_16x16x1xf32.
+// xdlops_info and XdlopsGemm read these constants; XdlopsGemm's constructor
+// cross-checks them against each other with static_asserts.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_16x16x1xf32>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 16;
+    static constexpr index_t n      = 16;
+    static constexpr index_t k      = 1;
+    static constexpr index_t k_base = 1;
+    static constexpr index_t cycles = 32;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 16;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 4;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 1;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+
+    // Forwards the operands to intrin_mfma_f32_16x16x1f32<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_16x16x1f32<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+// Compile-time description of mfma_instr::mfma_f32_4x4x1xf32.
+// The 4x4x1 instruction is modelled as a single-block 4x64 mfma, which is why
+// n and num_threads_blk are 64 and both block counts are 1.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_4x4x1xf32>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 4;
+    static constexpr index_t n      = 64;
+    static constexpr index_t k      = 1;
+    static constexpr index_t k_base = 1;
+    static constexpr index_t cycles = 8;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 64;
+    static constexpr index_t num_input_blks  = 1;
+    static constexpr index_t num_output_blks = 1;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 1;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = 4;
+
+    // Forwards the operands to intrin_mfma_f32_4x4x1f32<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_4x4x1f32<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+// Compile-time description of mfma_instr::mfma_f32_32x32x4f16.
+// k_base is 4 here, so XdlopsGemm::Run advances k in steps of 4 and requires
+// KPack to be a multiple of 4.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_32x32x4f16>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 32;
+    static constexpr index_t n      = 32;
+    static constexpr index_t k      = 4;
+    static constexpr index_t k_base = 4;
+    static constexpr index_t cycles = 64;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 32;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 2;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 4;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+
+    // Forwards the operands to intrin_mfma_f32_32x32x4f16<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_32x32x4f16<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+// Compile-time description of mfma_instr::mfma_f32_32x32x8f16 (marked
+// "k reduction" in mfma_instr). With one output block and two input blocks,
+// xdlops_info::IsKReduction() evaluates to true for this entry.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_32x32x8f16>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 32;
+    static constexpr index_t n      = 32;
+    static constexpr index_t k      = 8;
+    static constexpr index_t k_base = 4;
+    static constexpr index_t cycles = 64;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 32;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 1;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 4;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+
+    // Forwards the operands to intrin_mfma_f32_32x32x8f16<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_32x32x8f16<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+// Compile-time description of mfma_instr::mfma_f32_16x16x16f16 (marked
+// "k reduction" in mfma_instr). With one output block and four input blocks,
+// xdlops_info::IsKReduction() evaluates to true for this entry.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_16x16x16f16>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 16;
+    static constexpr index_t n      = 16;
+    static constexpr index_t k      = 16;
+    static constexpr index_t k_base = 4;
+    static constexpr index_t cycles = 32;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 16;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 1;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 1;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+
+    // Forwards the operands to intrin_mfma_f32_16x16x16f16<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_16x16x16f16<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+// Compile-time description of mfma_instr::mfma_f32_16x16x4f16.
+// k_base is 4 here, so XdlopsGemm::Run advances k in steps of 4 and requires
+// KPack to be a multiple of 4.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_16x16x4f16>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 16;
+    static constexpr index_t n      = 16;
+    static constexpr index_t k      = 4;
+    static constexpr index_t k_base = 4;
+    static constexpr index_t cycles = 32;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 16;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 4;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 1;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+
+    // Forwards the operands to intrin_mfma_f32_16x16x4f16<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_16x16x4f16<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+// Compile-time description of mfma_instr::mfma_f32_4x4x4f16.
+// Uses the same 4x64 single-block shape as the fp32 4x4x1 entry (m = 4,
+// n = num_threads_blk = 64, both block counts 1), with k_base = 4.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_4x4x4f16>
+{
+    // tile extents and k stepping
+    static constexpr index_t m      = 4;
+    static constexpr index_t n      = 64;
+    static constexpr index_t k      = 4;
+    static constexpr index_t k_base = 4;
+    static constexpr index_t cycles = 8;
+
+    // wave partitioning
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_threads_blk = 64;
+    static constexpr index_t num_input_blks  = 1;
+    static constexpr index_t num_output_blks = 1;
+
+    // register bookkeeping
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 1;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_regs_xdlops = 4;
+
+    // Forwards the operands to intrin_mfma_f32_4x4x4f16<...>::Run; acc is
+    // handed over by non-const reference.
+    template <index_t MPerXdl,
+              index_t NPerXdl,
+              index_t AccOffset,
+              typename TA,
+              typename TB,
+              typename TC>
+    __device__ void run(const TA& lhs, const TB& rhs, TC& acc) const
+    {
+        using intrin = intrin_mfma_f32_4x4x4f16<MPerXdl, NPerXdl, AccOffset>;
+        intrin::Run(lhs, rhs, acc);
+    }
+};
+
+#if 0
+// Description of mfma_instr::mfma_f32_32x32x2bf16.
+// NOTE: this specialization sits inside an `#if 0` region and is not compiled.
+// Its run() differs from the enabled fp32/fp16 specializations: it takes
+// AStride/BStride instead of COffset, receives raw pointers, and returns the
+// accumulator by value instead of updating a reference.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_32x32x2bf16>
+{
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 4;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_threads_blk = 32;
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 2;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+    static constexpr index_t m               = 32;
+    static constexpr index_t n               = 32;
+    static constexpr index_t k               = 2;
+    static constexpr index_t cycles          = 64;
+    static constexpr index_t k_base          = 2;
+
+    // Reinterprets both operand pointers as ushort2_t and returns whatever
+    // intrin_mfma_f32_32x32x2bf16<...>::run produces.
+    template <index_t MPerXdlops,
+              index_t NPerXdlops,
+              index_t AStride,
+              index_t BStride,
+              class FloatA,
+              class FloatB,
+              class FloatC>
+    __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const
+    {
+        const auto p_a = reinterpret_cast<const ushort2_t*>(a);
+        const auto p_b = reinterpret_cast<const ushort2_t*>(b);
+
+        return intrin_mfma_f32_32x32x2bf16<MPerXdlops, NPerXdlops, AStride, BStride>::run(
+            p_a, p_b, reg_c);
+    }
+};
+
+// Description of mfma_instr::mfma_f32_32x32x4bf16 ("k reduction" variant).
+// NOTE: this specialization sits inside an `#if 0` region and is not compiled.
+// Its run() takes raw pointers and returns the accumulator by value, unlike
+// the enabled fp32/fp16 specializations.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_32x32x4bf16>
+{
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 4;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_threads_blk = 32;
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 1;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+    static constexpr index_t m               = 32;
+    static constexpr index_t n               = 32;
+    static constexpr index_t k               = 4;
+    static constexpr index_t cycles          = 64;
+    static constexpr index_t k_base          = 2;
+
+    // Reinterprets both operand pointers as ushort2_t before the call.
+    // NOTE(review): the template parameters are unused here and the intrinsic
+    // is invoked as a plain function, not as `...<...>::run` -- confirm the
+    // intended form before re-enabling.
+    template <index_t MPerXdlops,
+              index_t NPerXdlops,
+              index_t AStride,
+              index_t BStride,
+              class FloatA,
+              class FloatB,
+              class FloatC>
+    __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const
+    {
+        const auto p_a = reinterpret_cast<const ushort2_t*>(a);
+        const auto p_b = reinterpret_cast<const ushort2_t*>(b);
+
+        return intrin_mfma_f32_32x32x4bf16(p_a, p_b, reg_c);
+    }
+};
+
+// Description of mfma_instr::mfma_f32_16x16x8bf16 ("k reduction" variant).
+// NOTE: this specialization sits inside an `#if 0` region and is not compiled.
+// Its run() takes raw pointers and returns the accumulator by value, unlike
+// the enabled fp32/fp16 specializations.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_16x16x8bf16>
+{
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 1;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_threads_blk = 16;
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 1;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+    static constexpr index_t m               = 16;
+    static constexpr index_t n               = 16;
+    static constexpr index_t k               = 8;
+    static constexpr index_t cycles          = 32;
+    static constexpr index_t k_base          = 2;
+
+    // Reinterprets both operand pointers as ushort2_t before the call.
+    // NOTE(review): the template parameters are unused here and the intrinsic
+    // is invoked as a plain function, not as `...<...>::run` -- confirm the
+    // intended form before re-enabling.
+    template <index_t MPerXdlops,
+              index_t NPerXdlops,
+              index_t AStride,
+              index_t BStride,
+              class FloatA,
+              class FloatB,
+              class FloatC>
+    __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const
+    {
+        const auto p_a = reinterpret_cast<const ushort2_t*>(a);
+        const auto p_b = reinterpret_cast<const ushort2_t*>(b);
+
+        return intrin_mfma_f32_16x16x8bf16(p_a, p_b, reg_c);
+    }
+};
+
+// Description of mfma_instr::mfma_f32_16x16x2bf16.
+// NOTE: this specialization sits inside an `#if 0` region and is not compiled.
+// Its run() takes raw pointers and returns the accumulator by value, unlike
+// the enabled fp32/fp16 specializations.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_16x16x2bf16>
+{
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 1;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_threads_blk = 16;
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_input_blks  = wave_size / num_threads_blk;
+    static constexpr index_t num_output_blks = 4;
+    static constexpr index_t num_regs_xdlops = num_regs_blk * num_output_blks;
+    static constexpr index_t m               = 16;
+    static constexpr index_t n               = 16;
+    static constexpr index_t k               = 2;
+    static constexpr index_t cycles          = 32;
+    static constexpr index_t k_base          = 2;
+
+    // Reinterprets both operand pointers as ushort2_t before the call.
+    // NOTE(review): AStride/BStride are unused and the intrinsic is invoked as
+    // a function template call, not as `...<...>::run` -- confirm the intended
+    // form before re-enabling.
+    template <index_t MPerXdlops,
+              index_t NPerXdlops,
+              index_t AStride,
+              index_t BStride,
+              class FloatA,
+              class FloatB,
+              class FloatC>
+    __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const
+    {
+        const auto p_a = reinterpret_cast<const ushort2_t*>(a);
+        const auto p_b = reinterpret_cast<const ushort2_t*>(b);
+
+        return intrin_mfma_f32_16x16x2bf16<MPerXdlops, NPerXdlops>(p_a, p_b, reg_c);
+    }
+};
+
+// Description of mfma_instr::mfma_f32_4x4x2bf16, using the same 4x64
+// single-block shape (m = 4, n = num_threads_blk = 64) as the 4x4 fp32/fp16
+// entries.
+// NOTE: this specialization sits inside an `#if 0` region and is not compiled.
+// Its run() takes raw pointers and returns the accumulator by value, unlike
+// the enabled fp32/fp16 specializations.
+template <>
+struct mfma_info<mfma_instr::mfma_f32_4x4x2bf16>
+{
+    static constexpr index_t group_size      = 4;
+    static constexpr index_t num_groups_blk  = 1;
+    static constexpr index_t num_regs_blk    = group_size * num_groups_blk;
+    static constexpr index_t num_threads_blk = 64;
+    static constexpr index_t wave_size       = 64;
+    static constexpr index_t num_input_blks  = 1;
+    static constexpr index_t num_output_blks = 1;
+    static constexpr index_t num_regs_xdlops = 4;
+    static constexpr index_t m               = 4;
+    static constexpr index_t n               = 64;
+    static constexpr index_t k               = 2;
+    static constexpr index_t cycles          = 8;
+    static constexpr index_t k_base          = 2;
+
+    // Reinterprets both operand pointers as ushort2_t and returns whatever
+    // intrin_mfma_f32_4x4x2bf16<...>::run produces. AStride/BStride are unused.
+    template <index_t MPerXdlops,
+              index_t NPerXdlops,
+              index_t AStride,
+              index_t BStride,
+              class FloatA,
+              class FloatB,
+              class FloatC>
+    __device__ FloatC run(const FloatA* a, const FloatB* b, FloatC reg_c) const
+    {
+        const auto p_a = reinterpret_cast<const ushort2_t*>(a);
+        const auto p_b = reinterpret_cast<const ushort2_t*>(b);
+
+        return intrin_mfma_f32_4x4x2bf16<MPerXdlops, NPerXdlops>::run(p_a, p_b, reg_c);
+    }
+};
+#endif
+
+// Pairs one MFMA instruction description (mfma_info<instr>) with the
+// MPerXdlops x NPerXdlops tile it is used for, and derives a few compile-time
+// properties from that pairing.
+template <mfma_instr instr, index_t MPerXdlops_, index_t NPerXdlops_>
+struct xdlops_info
+{
+    static constexpr index_t MPerXdlops = MPerXdlops_;
+    static constexpr index_t NPerXdlops = NPerXdlops_;
+
+    static constexpr auto mfma_type = mfma_info<instr>{};
+
+    // Always true; instantiating it with NPerXdlops < MPerXdlops is rejected
+    // at compile time.
+    static constexpr bool IsABroadcast()
+    {
+        static_assert(NPerXdlops >= MPerXdlops, "only support ABroadcast");
+        return true;
+    }
+
+    // True when the instruction has exactly one output block but more than one
+    // input block.
+    static constexpr bool IsKReduction()
+    {
+        constexpr bool single_output_blk = mfma_type.num_output_blks == 1;
+        constexpr bool several_input_blks = mfma_type.num_input_blks > 1;
+        return single_output_blk && several_input_blks;
+    }
+
+    // num_input_blks for k-reduction instructions, otherwise 1.
+    static constexpr index_t GetKPerXdlops()
+    {
+        if(IsKReduction())
+        {
+            return mfma_type.num_input_blks;
+        }
+        return 1;
+    }
+
+    // Tile element count divided by the wave size.
+    static constexpr index_t GetNumCRegs()
+    {
+        constexpr index_t tile_elems = MPerXdlops * NPerXdlops;
+        return tile_elems / mfma_type.wave_size;
+    }
+};
+
+// Wave-level GEMM step built on one MFMA (xdlops) instruction variant.
+//
+// Template parameters:
+//   base_type - element type used to pick the instruction; Run() accepts
+//               float, half_t or ushort.
+//   MPerWave  - M extent used to pick the xdlops_info entry.
+//   NPerWave  - N extent used to pick the xdlops_info entry.
+//   KPack     - Run() walks k from 0 to KPack in steps of the instruction's
+//               k_base, so KPack must be a multiple of k_base.
+//
+// Only (base_type, MPerWave, NPerWave) combinations that have a GetXdlopsInfo
+// specialization below can be instantiated.
+template <class base_type, index_t MPerWave, index_t NPerWave, index_t KPack>
+struct XdlopsGemm
+{
+    // Selector from (type, M, N) to an xdlops_info. Only declared here; each
+    // supported combination is supplied as an explicit specialization.
+    template <class base_type_  = base_type,
+              index_t MPerWave_ = MPerWave,
+              index_t NPerWave_ = NPerWave>
+    static constexpr auto GetXdlopsInfo();
+
+    // ---- fp32 ----
+    template <>
+    static constexpr auto GetXdlopsInfo<float, 64, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x1xf32, 64, 64>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<float, 32, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x1xf32, 32, 64>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<float, 16, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_16x16x1xf32, 16, 64>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<float, 8, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_4x4x1xf32, 8, 64>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<float, 4, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_4x4x1xf32, 4, 64>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<float, 32, 32>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x2xf32, 32, 32>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<float, 16, 16>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_16x16x4xf32, 16, 16>{};
+    }
+
+    // ---- fp16 ----
+    template <>
+    static constexpr auto GetXdlopsInfo<half_t, 64, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x4f16, 64, 64>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<half_t, 32, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x4f16, 32, 64>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<half_t, 32, 32>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x8f16, 32, 32>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<half_t, 16, 16>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_16x16x16f16, 16, 16>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<half_t, 16, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_16x16x4f16, 16, 64>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<half_t, 8, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_4x4x4f16, 8, 64>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<half_t, 4, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_4x4x4f16, 4, 64>{};
+    }
+
+    // ---- bf16 (disabled) ----
+    // These entries pass six template arguments to xdlops_info, which does not
+    // match the three-parameter xdlops_info defined in this file; they must be
+    // ported before the block can be re-enabled.
+#if 0
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 128, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x2bf16, 64, 64, 2, 1, c_vec32_4_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 64, 128>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x2bf16, 64, 64, 1, 2, c_vec32_4_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 64, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x2bf16, 64, 64, 1, 1, c_vec32_2_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 64, 32>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x2bf16, 64, 32, 1, 1, c_vec32_1_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 32, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x2bf16, 32, 64, 1, 1, c_vec32_1_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 64, 16>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_16x16x2bf16, 64, 16, 1, 1, c_vec16_1_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 16, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_16x16x2bf16, 16, 64, 1, 1, c_vec16_1_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 8, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_4x4x2bf16, 8, 64, 1, 1, c_vec4_2_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 4, 64>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_4x4x2bf16, 4, 64, 1, 1, c_vec4_1_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 32, 32>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_32x32x4bf16, 32, 32, 1, 1, c_vec16_1_t>{};
+    }
+
+    template <>
+    static constexpr auto GetXdlopsInfo<ushort, 16, 16>()
+    {
+        return xdlops_info<mfma_instr::mfma_f32_16x16x8bf16, 16, 16, 1, 1, c_vec4_1_t>{};
+    }
+#endif
+
+    // (m, n) index pair returned by GetBeginOfThreadBlk.
+    using CIndex = MultiIndex<2>;
+
+    // Number of output blocks of the selected instruction.
+    __device__ static constexpr index_t GetNumBlks() { return mfma_type.num_output_blks; }
+
+    // Tile element count divided by the element count of one instruction
+    // (m * n * num_output_blks).
+    __device__ static constexpr index_t GetNumXdlops()
+    {
+        return MPerXdlops * NPerXdlops / (mfma_type.m * mfma_type.n * mfma_type.num_output_blks);
+    }
+
+    // Has no runtime effect; constructing the object triggers the compile-time
+    // consistency checks on the tile sizes and on the mfma_info constants.
+    __host__ __device__ constexpr XdlopsGemm()
+    {
+        static_assert(NPerXdlops == 4 || NPerXdlops == 8 || NPerXdlops == 16 || NPerXdlops == 32 ||
+                          NPerXdlops == 64,
+                      "Only support GemmNPerXdlops == 4, 8, 16, 32 or 64 for xdlops");
+
+        static_assert(MPerXdlops == 4 || MPerXdlops == 8 || MPerXdlops == 16 || MPerXdlops == 32 ||
+                          MPerXdlops == 64,
+                      "Only support GemmMPerXdlops == 4, 8, 16, 32 or 64 for xdlops");
+
+        static_assert(mfma_type.num_threads_blk == mfma_type.n, "n != num_threads_blk");
+        static_assert(mfma_type.num_regs_blk * mfma_type.num_input_blks == mfma_type.m,
+                      "m != num_input_blks * num_regs_blk");
+        static_assert(mfma_type.num_output_blks == mfma_type.num_input_blks ||
+                          mfma_type.num_output_blks == 1,
+                      "incorrect num_output_blks");
+        static_assert(mfma_type.num_regs_blk * mfma_type.wave_size == mfma_type.m * mfma_type.n,
+                      "num_regs_blk incorrect");
+
+        static_assert(mfma_type.k % mfma_type.k_base == 0, "k % kbase != 0!");
+    }
+
+    // Tile element count divided by the wave size.
+    __device__ static constexpr index_t GetRegSizePerXdlops()
+    {
+        return MPerXdlops * NPerXdlops / mfma_type.wave_size;
+    }
+
+    // Issues the selected MFMA once per k_base-sized step of KPack.
+    //
+    //   ADesc/BDesc/CDesc - default-constructible descriptor types whose
+    //                       CalculateOffset is usable in constant expressions.
+    //   m0, n0            - compile-time indices fed into the descriptors.
+    //   p_a_wave/p_b_wave - operand containers indexed with Number<...>.
+    //   p_c_thread        - accumulator, passed by reference to mfma_type.run.
+    template <class ADesc,
+              class BDesc,
+              class CDesc,
+              index_t m0,
+              index_t n0,
+              class FloatA,
+              class FloatB,
+              class FloatC>
+    __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const
+    {
+        static_assert(is_same<base_type, float>::value || is_same<base_type, half_t>::value ||
+                          is_same<base_type, ushort>::value,
+                      "base base_type must be float, half, ushort!");
+
+        static_assert(KPack % mfma_type.k_base == 0, "KPack cannot be divided by k_base");
+
+        // The accumulator offset is scaled by the number of instructions per tile.
+        constexpr index_t c_offset = CDesc{}.CalculateOffset(make_tuple(m0, n0)) * GetNumXdlops();
+
+        static_for<0, KPack, mfma_type.k_base>{}([&](auto k) {
+            constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(0, m0, 0, k));
+            constexpr index_t b_offset = BDesc{}.CalculateOffset(make_tuple(0, n0, 0, k));
+
+            // Offsets are divided by k_base before indexing the operand
+            // containers.
+            mfma_type.template run<MPerXdlops, NPerXdlops, c_offset>(
+                p_a_wave[Number<a_offset / mfma_type.k_base>{}],
+                p_b_wave[Number<b_offset / mfma_type.k_base>{}],
+                p_c_thread);
+        });
+    }
+
+    // Returns the (m, n) origin for the calling lane, given an xdlops index
+    // and a block index. The lane id is the thread id modulo wave_size; it is
+    // split into a block id (quotient) and an in-block id (remainder) by
+    // num_threads_blk.
+    __device__ static CIndex GetBeginOfThreadBlk(index_t xdlops_i, index_t blk_i)
+    {
+        const index_t laneId = get_thread_local_1d_id() % mfma_type.wave_size;
+        const index_t blk_id = laneId / mfma_type.num_threads_blk;
+        const index_t blk_td = laneId % mfma_type.num_threads_blk;
+
+        index_t n_offset = blk_i * mfma_type.n + blk_td;
+        index_t m_offset = xdlops_i * mfma_type.m + blk_id * mfma_type.group_size;
+
+        return CIndex{m_offset, n_offset};
+    }
+
+    // Tile sizes and derived properties of the selected xdlops_info.
+    static constexpr index_t MPerXdlops = GetXdlopsInfo().MPerXdlops;
+    static constexpr index_t NPerXdlops = GetXdlopsInfo().NPerXdlops;
+
+    static constexpr bool IsKReduction  = GetXdlopsInfo().IsKReduction();
+    static constexpr bool IsABroadcast  = GetXdlopsInfo().IsABroadcast();
+    static constexpr index_t KPerXdlops = GetXdlopsInfo().GetKPerXdlops();
+
+    // Block id of a lane: lane_id / num_threads_blk.
+    static constexpr auto GetBlkId(const index_t lane_id)
+    {
+        return lane_id / mfma_type.num_threads_blk;
+    }
+
+    // Position of a lane inside its block: lane_id % num_threads_blk.
+    static constexpr auto GetBlkTd(const index_t lane_id)
+    {
+        return lane_id % mfma_type.num_threads_blk;
+    }
+
+    // mfma_info object of the selected instruction.
+    static constexpr auto mfma_type = GetXdlopsInfo().mfma_type;
+
+    // Exposes the mfma_info constants under the M0/M1/N0/N1 names.
+    struct CLayout
+    {
+        __host__ __device__ static constexpr index_t M1() { return mfma_type.num_groups_blk; }
+        __host__ __device__ static constexpr index_t M0() { return mfma_type.group_size; }
+        __host__ __device__ static constexpr index_t N1() { return mfma_type.num_input_blks; }
+        __host__ __device__ static constexpr index_t N0() { return mfma_type.num_threads_blk; }
+
+        __device__ static constexpr index_t GetBlkSize() { return mfma_type.num_regs_blk; }
+
+        __device__ static constexpr index_t GetNumBlks() { return mfma_type.num_output_blks; }
+
+        // Same value as XdlopsGemm::GetNumXdlops().
+        __device__ static constexpr index_t GetNumXdlops()
+        {
+            return MPerXdlops * NPerXdlops /
+                   (mfma_type.m * mfma_type.n * mfma_type.num_output_blks);
+        }
+    };
+
+    __host__ __device__ static constexpr auto GetCLayout() { return CLayout{}; }
+};
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
+++ b/composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
@@ -268,6 +268,7 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
        }
        else if constexpr(N == 8)
        {
+#if 0
            vector_type<half_t, 8> tmp;

            tmp.AsType<half4_t>()(Number<0>{}) = __llvm_amdgcn_raw_buffer_load_fp16x4(
@@ -280,6 +281,12 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource,
                                                     0);

            return tmp.AsType<half8_t>()(Number<0>{});
+#else
+            float4_t tmp = __llvm_amdgcn_raw_buffer_load_fp32x4(
+                src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0);
+
+            return as_type<half8_t>(tmp);
+#endif
        }
    }
    else if constexpr(is_same<T, int32_t>::value)
--- a/composable_kernel/include/utility/amd_xdlops.hpp
+++ b/composable_kernel/include/utility/amd_xdlops.hpp
@@ -0,0 +1,499 @@
+#ifndef CK_AMD_XDLOPS_HPP
+#define CK_AMD_XDLOPS_HPP
+
+#include "float_type.hpp"
+
+namespace ck {
+
+// Bindings to the llvm.amdgcn.mfma.* intrinsics; argument order: A, B, C, cbsz, abid, blgp
+extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x1f32(
+    float, float, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x1f32");
+
+extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x2f32(
+    float, float, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x2f32");
+
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x4f32(
+    float, float, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x4f32");
+
+extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x1f32(
+    float, float, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x1f32");
+
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x1f32(
+    float, float, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x1f32");
+
+extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x4f16(
+    half4_t, half4_t, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x4f16");
+
+extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x8f16(
+    half4_t, half4_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x8f16");
+
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x16f16(
+    half4_t, half4_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x16f16");
+
+extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x4f16(
+    half4_t, half4_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x4f16");
+
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x4f16(
+    half4_t, half4_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x4f16");
+
+extern "C" __device__ float32_t llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(
+    ushort2_t, ushort2_t, float32_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x2bf16");
+
+extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_32x32x4bf16(
+    ushort2_t, ushort2_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.32x32x4bf16");
+
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_16x16x8bf16(
+    ushort2_t, ushort2_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x8bf16");
+
+extern "C" __device__ float16_t llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(
+    ushort2_t, ushort2_t, float16_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.16x16x2bf16");
+
+extern "C" __device__ float4_t llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(
+    ushort2_t, ushort2_t, float4_t, int, int, int) __asm("llvm.amdgcn.mfma.f32.4x4x2bf16");
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_32x32x1f32;
+
+template <index_t COffset> // MPerWave = 64, NPerWave = 64: two MFMA issues (abid 0, then 1)
+struct intrin_mfma_f32_32x32x1f32<64, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float32_t>()(Number<0>{}) = // slot COffset
+            llvm_intrin_amdgcn_mfma_f32_32x32x1f32(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float32_t>()[Number<0>{}],
+                1,  // cbsz
+                0,  // abid
+                0); // blgp
+        reg_c(Number<COffset + 1>{}).template AsType<float32_t>()(Number<0>{}) = // slot COffset + 1
+            llvm_intrin_amdgcn_mfma_f32_32x32x1f32(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset + 1>{}].template AsType<float32_t>()[Number<0>{}],
+                1,  // cbsz
+                1,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t COffset> // MPerWave = 32, NPerWave = 64: one MFMA issue into slot COffset
+struct intrin_mfma_f32_32x32x1f32<32, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float32_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_32x32x1f32(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float32_t>()[Number<0>{}],
+                1,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_32x32x2f32;
+
+template <index_t COffset> // MPerWave = 32, NPerWave = 32: one MFMA issue into slot COffset
+struct intrin_mfma_f32_32x32x2f32<32, 32, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float16_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_32x32x2f32(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float16_t>()[Number<0>{}],
+                0,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_16x16x4f32;
+
+template <index_t COffset> // MPerWave = 16, NPerWave = 16: one MFMA issue into slot COffset
+struct intrin_mfma_f32_16x16x4f32<16, 16, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float4_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_16x16x4f32(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float4_t>()[Number<0>{}],
+                0,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_16x16x1f32;
+
+template <index_t COffset> // MPerWave = 16, NPerWave = 64: one MFMA issue into slot COffset
+struct intrin_mfma_f32_16x16x1f32<16, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c)
+    {
+
+        reg_c(Number<COffset>{}).template AsType<float16_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_16x16x1f32(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float16_t>()[Number<0>{}],
+                2,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_4x4x1f32;
+
+template <index_t COffset> // MPerWave = 4, NPerWave = 64: one MFMA issue into slot COffset
+struct intrin_mfma_f32_4x4x1f32<4, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float4_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_4x4x1f32(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float4_t>()[Number<0>{}],
+                4,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t COffset> // MPerWave = 8, NPerWave = 64: two MFMA issues (abid 0, then 1)
+struct intrin_mfma_f32_4x4x1f32<8, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float4_t>()(Number<0>{}) = // slot COffset
+            llvm_intrin_amdgcn_mfma_f32_4x4x1f32(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float4_t>()[Number<0>{}],
+                4,  // cbsz
+                0,  // abid
+                0); // blgp
+        reg_c(Number<COffset + 1>{}).template AsType<float4_t>()(Number<0>{}) = // slot COffset + 1
+            llvm_intrin_amdgcn_mfma_f32_4x4x1f32(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset + 1>{}].template AsType<float4_t>()[Number<0>{}],
+                4,  // cbsz
+                1,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_32x32x4f16;
+
+template <index_t COffset> // MPerWave = 64, NPerWave = 64: two MFMA issues (abid 0, then 1)
+struct intrin_mfma_f32_32x32x4f16<64, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float32_t>()(Number<0>{}) = // slot COffset
+            llvm_intrin_amdgcn_mfma_f32_32x32x4f16(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float32_t>()[Number<0>{}],
+                1,  // cbsz
+                0,  // abid
+                0); // blgp
+        reg_c(Number<COffset + 1>{}).template AsType<float32_t>()(Number<0>{}) = // slot COffset + 1
+            llvm_intrin_amdgcn_mfma_f32_32x32x4f16(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset + 1>{}].template AsType<float32_t>()[Number<0>{}],
+                1,  // cbsz
+                1,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t COffset> // MPerWave = 32, NPerWave = 64: one MFMA issue into slot COffset
+struct intrin_mfma_f32_32x32x4f16<32, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float32_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_32x32x4f16(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float32_t>()[Number<0>{}],
+                1,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_32x32x8f16;
+
+template <index_t COffset> // MPerWave = 32, NPerWave = 32: one MFMA issue into slot COffset
+struct intrin_mfma_f32_32x32x8f16<32, 32, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float16_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_32x32x8f16(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float16_t>()[Number<0>{}],
+                0,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_16x16x16f16;
+
+template <index_t COffset> // MPerWave = 16, NPerWave = 16: one MFMA issue into slot COffset
+struct intrin_mfma_f32_16x16x16f16<16, 16, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float4_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_16x16x16f16(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float4_t>()[Number<0>{}],
+                0,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_16x16x4f16;
+
+template <index_t COffset> // MPerWave = 16, NPerWave = 64: one MFMA issue into slot COffset
+struct intrin_mfma_f32_16x16x4f16<16, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float16_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_16x16x4f16(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float16_t>()[Number<0>{}],
+                2,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t MPerWave, index_t NPerWave, index_t COffset>
+struct intrin_mfma_f32_4x4x4f16;
+
+template <index_t COffset> // MPerWave = 4, NPerWave = 64: one MFMA issue into slot COffset
+struct intrin_mfma_f32_4x4x4f16<4, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float4_t>()(Number<0>{}) =
+            llvm_intrin_amdgcn_mfma_f32_4x4x4f16(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float4_t>()[Number<0>{}],
+                4,  // cbsz
+                0,  // abid
+                0); // blgp
+    }
+};
+
+template <index_t COffset> // MPerWave = 8, NPerWave = 64: two MFMA issues (abid 0, then 1)
+struct intrin_mfma_f32_4x4x4f16<8, 64, COffset>
+{
+    template <class FloatC>
+    __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c)
+    {
+        reg_c(Number<COffset>{}).template AsType<float4_t>()(Number<0>{}) = // slot COffset
+            llvm_intrin_amdgcn_mfma_f32_4x4x4f16(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset>{}].template AsType<float4_t>()[Number<0>{}],
+                4,  // cbsz
+                0,  // abid
+                0); // blgp
+        reg_c(Number<COffset + 1>{}).template AsType<float4_t>()(Number<0>{}) = // slot COffset + 1
+            llvm_intrin_amdgcn_mfma_f32_4x4x4f16(
+                reg_a,
+                reg_b,
+                reg_c[Number<COffset + 1>{}].template AsType<float4_t>()[Number<0>{}],
+                4,  // cbsz
+                1,  // abid
+                0); // blgp
+    }
+};
+
+#if 0 // bf16 wrappers compiled out; they use c_vec*_t and run(), unlike the Run(FloatC&) wrappers above — NOTE(review): presumably awaiting a port, confirm
+template <index_t MPerWave, index_t NPerWave, index_t AStride, index_t BStride>
+struct intrin_mfma_f32_32x32x2bf16;
+
+template <index_t AStride, index_t BStride>
+struct intrin_mfma_f32_32x32x2bf16<128, 64, AStride, BStride>
+{
+    __device__ static c_vec32_4_t::VecType
+    run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_4_t::VecType reg_c)
+    {
+        reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0);
+        reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 1, 1, 0);
+
+        reg_c.s.z =
+            llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[AStride], reg_b[0], reg_c.s.z, 1, 0, 0);
+        reg_c.s.w =
+            llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[AStride], reg_b[0], reg_c.s.w, 1, 1, 0);
+
+        return reg_c;
+    }
+};
+
+template <index_t AStride, index_t BStride>
+struct intrin_mfma_f32_32x32x2bf16<64, 128, AStride, BStride>
+{
+    __device__ static c_vec32_4_t::VecType
+    run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_4_t::VecType reg_c)
+    {
+        reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0);
+        reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 1, 1, 0);
+
+        reg_c.s.z =
+            llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[BStride], reg_c.s.z, 1, 0, 0);
+        reg_c.s.w =
+            llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[BStride], reg_c.s.w, 1, 1, 0);
+
+        return reg_c;
+    }
+};
+
+template <index_t AStride, index_t BStride>
+struct intrin_mfma_f32_32x32x2bf16<64, 64, AStride, BStride>
+{
+    __device__ static c_vec32_2_t::VecType
+    run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_2_t::VecType reg_c)
+    {
+        reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0);
+        reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 1, 1, 0);
+
+        return reg_c;
+    }
+};
+
+template <index_t AStride, index_t BStride>
+struct intrin_mfma_f32_32x32x2bf16<64, 32, AStride, BStride>
+{
+    __device__ static c_vec32_1_t::VecType
+    run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_1_t::VecType reg_c)
+    {
+        reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 1);
+
+        return reg_c;
+    }
+};
+
+template <index_t AStride, index_t BStride>
+struct intrin_mfma_f32_32x32x2bf16<32, 64, AStride, BStride>
+{
+    __device__ static c_vec32_1_t::VecType
+    run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec32_1_t::VecType reg_c)
+    {
+        reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 1, 0, 0);
+        return reg_c;
+    }
+};
+
+__device__ c_vec16_1_t::VecType intrin_mfma_f32_32x32x4bf16(const ushort2_t* reg_a,
+                                                            const ushort2_t* reg_b,
+                                                            c_vec16_1_t::VecType reg_c)
+{
+    reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_32x32x4bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 0);
+    return reg_c;
+}
+
+__device__ c_vec4_1_t::VecType intrin_mfma_f32_16x16x8bf16(const ushort2_t* reg_a,
+                                                           const ushort2_t* reg_b,
+                                                           c_vec4_1_t::VecType reg_c)
+{
+    reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_16x16x8bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 0);
+    return reg_c;
+}
+
+template <index_t MPerWave, index_t NPerWave>
+__device__ c_vec16_1_t::VecType intrin_mfma_f32_16x16x2bf16(const ushort2_t* reg_a,
+                                                            const ushort2_t* reg_b,
+                                                            c_vec16_1_t::VecType reg_c);
+
+template <>
+__device__ c_vec16_1_t::VecType intrin_mfma_f32_16x16x2bf16<16, 64>(const ushort2_t* reg_a,
+                                                                    const ushort2_t* reg_b,
+                                                                    c_vec16_1_t::VecType reg_c)
+{
+    reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 2, 0, 0);
+    return reg_c;
+}
+
+template <>
+__device__ c_vec16_1_t::VecType intrin_mfma_f32_16x16x2bf16<64, 16>(const ushort2_t* reg_a,
+                                                                    const ushort2_t* reg_b,
+                                                                    c_vec16_1_t::VecType reg_c)
+{
+    reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_16x16x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 0, 0, 4);
+    return reg_c;
+}
+
+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f32_4x4x2bf16;
+
+template <>
+struct intrin_mfma_f32_4x4x2bf16<4, 64>
+{
+    __device__ static c_vec4_1_t::VecType
+    run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec4_1_t::VecType reg_c)
+    {
+        reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 4, 0, 0);
+        return reg_c;
+    }
+};
+
+template <>
+struct intrin_mfma_f32_4x4x2bf16<8, 64>
+{
+    __device__ static c_vec4_2_t::VecType
+    run(const ushort2_t* reg_a, const ushort2_t* reg_b, c_vec4_2_t::VecType reg_c)
+    {
+        reg_c.s.x = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a[0], reg_b[0], reg_c.s.x, 4, 0, 0);
+        reg_c.s.y = llvm_intrin_amdgcn_mfma_f32_4x4x2bf16(reg_a[0], reg_b[0], reg_c.s.y, 4, 1, 0);
+        return reg_c;
+    }
+};
+
+#endif
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -18,7 +18,7 @@
 #define CK_AMD_GPU_GFX906 1
 #elif 1
 #define CK_AMD_GPU_GFX908 1
-#elif 1
+#elif 0
 #define CK_AMD_GPU_GFX1030 1
 #endif

@@ -28,7 +28,7 @@
 #endif

 // launch bounds
-#define CK_USE_LAUNCH_BOUNDS 0
+#define CK_USE_LAUNCH_BOUNDS 1

 #ifdef CK_USE_LAUNCH_BOUNDS
 #define CK_MAX_THREAD_PER_BLOCK 256
@@ -116,7 +116,7 @@
 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 1

 // merge transformation use magic number division
-#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0
+#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1

 // hack: have underlying assumption that need to be satsified, otherwise it's a bug
 // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
--- a/composable_kernel/include/utility/container_helper.hpp
+++ b/composable_kernel/include/utility/container_helper.hpp
@@ -174,8 +174,15 @@ __host__ __device__ constexpr auto container_reduce(const Container& x,
 {
    static_assert((IEnd - IBegin) % IStep == 0, "wrong!");

-    return container_reduce_impl(
-        x, reduce, init, Number<IBegin>{}, Number<IEnd>{}, Number<IStep>{});
+    if constexpr(IEnd > IBegin)
+    {
+        return container_reduce_impl(
+            x, reduce, init, Number<IBegin>{}, Number<IEnd>{}, Number<IStep>{});
+    }
+    else
+    {
+        return init;
+    }
 }
 #endif

--- a/composable_kernel/include/utility/float_type.amd.hpp.in
+++ b/composable_kernel/include/utility/float_type.amd.hpp.in
@@ -618,6 +618,252 @@ struct vector_type<T, 64>
    }
 };

+template <typename T> // 128-element specialization
+struct vector_type<T, 128>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+    typedef T d16_t __attribute__((ext_vector_type(16)));
+    typedef T d32_t __attribute__((ext_vector_type(32)));
+    typedef T d64_t __attribute__((ext_vector_type(64)));
+    typedef T d128_t __attribute__((ext_vector_type(128)));
+
+    using type = d128_t; // full-width vector is the public value type
+
+    union // every member below is a view of the same 128 elements at a different granularity
+    {
+        d128_t d128_;
+        StaticallyIndexedArray<d1_t, 128> d1x128_;
+        StaticallyIndexedArray<d2_t, 64> d2x64_;
+        StaticallyIndexedArray<d4_t, 32> d4x32_;
+        StaticallyIndexedArray<d8_t, 16> d8x16_;
+        StaticallyIndexedArray<d16_t, 8> d16x8_;
+        StaticallyIndexedArray<d32_t, 4> d32x4_;
+        StaticallyIndexedArray<d64_t, 2> d64x2_;
+        StaticallyIndexedArray<d128_t, 1> d128x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {} // default: built from type{0}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {} // wrap an existing vector
+
+    template <typename X> // read-only view; X selects the element width
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                          is_same<X, d64_t>::value || is_same<X, d128_t>::value,
+                      "wrong!"); // X must be one of d1_t .. d128_t
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x128_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x64_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x32_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x16_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x8_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x4_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x2_;
+        }
+        else if constexpr(is_same<X, d128_t>::value)
+        {
+            return data_.d128x1_;
+        }
+    }
+
+    template <typename X> // mutable counterpart of the const overload above
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(is_same<X, d1_t>::value || is_same<X, d2_t>::value ||
+                          is_same<X, d4_t>::value || is_same<X, d8_t>::value ||
+                          is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                          is_same<X, d64_t>::value || is_same<X, d128_t>::value,
+                      "wrong!"); // X must be one of d1_t .. d128_t
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x128_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x64_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x32_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x16_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x8_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x4_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x2_;
+        }
+        else if constexpr(is_same<X, d128_t>::value)
+        {
+            return data_.d128x1_;
+        }
+    }
+};
+
+template <typename T> // 256-element specialization
+struct vector_type<T, 256>
+{
+    using d1_t = T;
+    typedef T d2_t __attribute__((ext_vector_type(2)));
+    typedef T d4_t __attribute__((ext_vector_type(4)));
+    typedef T d8_t __attribute__((ext_vector_type(8)));
+    typedef T d16_t __attribute__((ext_vector_type(16)));
+    typedef T d32_t __attribute__((ext_vector_type(32)));
+    typedef T d64_t __attribute__((ext_vector_type(64)));
+    typedef T d128_t __attribute__((ext_vector_type(128)));
+    typedef T d256_t __attribute__((ext_vector_type(256)));
+
+    using type = d256_t; // full-width vector is the public value type
+
+    union // every member below is a view of the same 256 elements at a different granularity
+    {
+        d256_t d256_;
+        StaticallyIndexedArray<d1_t, 256> d1x256_;
+        StaticallyIndexedArray<d2_t, 128> d2x128_;
+        StaticallyIndexedArray<d4_t, 64> d4x64_;
+        StaticallyIndexedArray<d8_t, 32> d8x32_;
+        StaticallyIndexedArray<d16_t, 16> d16x16_;
+        StaticallyIndexedArray<d32_t, 8> d32x8_;
+        StaticallyIndexedArray<d64_t, 4> d64x4_;
+        StaticallyIndexedArray<d128_t, 2> d128x2_;
+        StaticallyIndexedArray<d256_t, 1> d256x1_;
+    } data_;
+
+    __host__ __device__ constexpr vector_type() : data_{type{0}} {} // default: built from type{0}
+
+    __host__ __device__ constexpr vector_type(type v) : data_{v} {} // wrap an existing vector
+
+    template <typename X> // read-only view; X selects the element width
+    __host__ __device__ constexpr const auto& AsType() const
+    {
+        static_assert(
+            is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                is_same<X, d8_t>::value || is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                is_same<X, d64_t>::value || is_same<X, d128_t>::value || is_same<X, d256_t>::value,
+            "wrong!"); // X must be one of d1_t .. d256_t
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x256_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x128_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x64_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x32_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x16_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x8_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x4_;
+        }
+        else if constexpr(is_same<X, d128_t>::value)
+        {
+            return data_.d128x2_;
+        }
+        else if constexpr(is_same<X, d256_t>::value)
+        {
+            return data_.d256x1_;
+        }
+    }
+
+    template <typename X> // mutable counterpart of the const overload above
+    __host__ __device__ constexpr auto& AsType()
+    {
+        static_assert(
+            is_same<X, d1_t>::value || is_same<X, d2_t>::value || is_same<X, d4_t>::value ||
+                is_same<X, d8_t>::value || is_same<X, d16_t>::value || is_same<X, d32_t>::value ||
+                is_same<X, d64_t>::value || is_same<X, d128_t>::value || is_same<X, d256_t>::value,
+            "wrong!"); // X must be one of d1_t .. d256_t
+
+        if constexpr(is_same<X, d1_t>::value)
+        {
+            return data_.d1x256_;
+        }
+        else if constexpr(is_same<X, d2_t>::value)
+        {
+            return data_.d2x128_;
+        }
+        else if constexpr(is_same<X, d4_t>::value)
+        {
+            return data_.d4x64_;
+        }
+        else if constexpr(is_same<X, d8_t>::value)
+        {
+            return data_.d8x32_;
+        }
+        else if constexpr(is_same<X, d16_t>::value)
+        {
+            return data_.d16x16_;
+        }
+        else if constexpr(is_same<X, d32_t>::value)
+        {
+            return data_.d32x8_;
+        }
+        else if constexpr(is_same<X, d64_t>::value)
+        {
+            return data_.d64x4_;
+        }
+        else if constexpr(is_same<X, d128_t>::value)
+        {
+            return data_.d128x2_;
+        }
+        else if constexpr(is_same<X, d256_t>::value)
+        {
+            return data_.d256x1_;
+        }
+    }
+};
+
 // fp32
 using float2_t  = typename vector_type<float, 2>::type;
 using float4_t  = typename vector_type<float, 4>::type;
--- a/composable_kernel/include/utility/math.hpp
+++ b/composable_kernel/include/utility/math.hpp
@@ -9,25 +9,25 @@
 namespace ck {
 namespace math {

-template <class T, T s>
+template <typename T, T s>
 struct scales
 {
    __host__ __device__ constexpr T operator()(T a) const { return s * a; }
 };

-template <class T>
+template <typename T>
 struct plus
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
 };

-template <class T>
+template <typename T>
 struct minus
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
 };

-template <class T>
+template <typename T>
 struct multiplies
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
@@ -42,83 +42,111 @@ struct multiplies_v2
    }
 };

-template <class T>
+template <typename T>
 struct maximize
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a >= b ? a : b; }
 };

-template <class T>
+template <typename T>
 struct minimize
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a <= b ? a : b; }
 };

-template <class T>
+template <typename T>
 struct integer_divide_ceiler
 {
    __host__ __device__ constexpr T operator()(T a, T b) const
    {
        static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");

-        return (a + b - 1) / b;
+        return (a + b - Number<1>{}) / b;
    }
 };

-template <class X, class Y>
+template <typename X, typename Y>
 __host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
 {
    return x / y;
 }

-template <class X, class Y>
+template <typename X, typename Y>
 __host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
 {
    return (x + y - Number<1>{}) / y;
 }

-template <class X, class Y>
+template <typename X, typename Y>
 __host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
 {
    return y * integer_divide_ceil(x, y);
 }

-template <class T>
+template <typename T>
 __host__ __device__ constexpr T max(T x)
 {
    return x;
 }

-template <class T, class... Ts>
-__host__ __device__ constexpr T max(T x, Ts... xs)
+template <typename T>
+__host__ __device__ constexpr T max(T x, T y)
 {
-    static_assert(sizeof...(xs) > 0, "not enough argument");
-
-    auto y = max(xs...);
-
-    static_assert(is_same<decltype(y), T>{}, "not the same type");
-
    return x > y ? x : y;
 }

-template <class T>
+template <index_t X> // mixed overload: compile-time Number<X> against a runtime index_t
+__host__ __device__ constexpr index_t max(Number<X>, index_t y)
+{
+    return X > y ? X : y; // result is a plain index_t, not a Number
+}
+
+template <index_t Y> // mixed overload: runtime index_t against a compile-time Number<Y>
+__host__ __device__ constexpr index_t max(index_t x, Number<Y>)
+{
+    return x > Y ? x : Y; // result is a plain index_t, not a Number
+}
+
+template <typename X, typename... Ys> // variadic fold: max(x, max(ys...))
+__host__ __device__ constexpr auto max(X x, Ys... ys)
+{
+    static_assert(sizeof...(Ys) > 0, "not enough argument");
+
+    return max(x, max(ys...)); // NOTE(review): relies on a non-variadic max overload matching the final pair
+}
+
+template <typename T>
 __host__ __device__ constexpr T min(T x)
 {
    return x;
 }

-template <class T, class... Ts>
-__host__ __device__ constexpr T min(T x, Ts... xs)
+template <typename T>
+__host__ __device__ constexpr T min(T x, T y)
 {
-    static_assert(sizeof...(xs) > 0, "not enough argument");
-
-    auto y = min(xs...);
-
-    static_assert(is_same<decltype(y), T>{}, "not the same type");
-
    return x < y ? x : y;
 }

+template <index_t X> // mixed overload: compile-time Number<X> against a runtime index_t
+__host__ __device__ constexpr index_t min(Number<X>, index_t y)
+{
+    return X < y ? X : y; // result is a plain index_t, not a Number
+}
+
+template <index_t Y> // mixed overload: runtime index_t against a compile-time Number<Y>
+__host__ __device__ constexpr index_t min(index_t x, Number<Y>)
+{
+    return x < Y ? x : Y; // result is a plain index_t, not a Number
+}
+
+template <typename X, typename... Ys> // variadic fold: min(x, min(ys...))
+__host__ __device__ constexpr auto min(X x, Ys... ys)
+{
+    static_assert(sizeof...(Ys) > 0, "not enough argument");
+
+    return min(x, min(ys...)); // NOTE(review): relies on a non-variadic min overload matching the final pair
+}
+
 // greatest common divisor, aka highest common factor
 __host__ __device__ constexpr index_t gcd(index_t x, index_t y)
 {
@@ -171,13 +199,13 @@ __host__ __device__ constexpr auto lcm(X x, Ys... ys)
    return lcm(x, lcm(ys...));
 }

-template <class T>
+template <typename T>
 struct equal
 {
    __host__ __device__ constexpr bool operator()(T x, T y) const { return x == y; }
 };

-template <class T>
+template <typename T>
 struct less
 {
    __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; }
--- a/composable_kernel/include/utility/tuple.hpp
+++ b/composable_kernel/include/utility/tuple.hpp
@@ -153,6 +153,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X

        return *this;
    }
+
+    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } // unconditionally true; presumably lets a Tuple stand in for a static buffer — TODO confirm callers
 };

 template <typename... Xs>