diff --git a/CHANGELOG.md b/CHANGELOG.md
index 31f129b581..9d38a66382 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,18 @@
 
 Full documentation for Composable Kernel is not yet available.
 
+## (Unreleased) CK for ROCm 6.0.0
+
+### Fixed
+
+### Optimizations
+
+### Added
+- Added image to column (#867) and column to image kernels (#930).
+
+### Changed
+
+
 ## CK 0.2.0 for ROCm 5.5.0
 
 ### Fixed
@@ -29,4 +41,3 @@ Full documentation for Composable Kernel is not yet available.
 - Added MaxPool backward (#750).
 
 ### Changed
-- Changed ...
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7b8e43b9df..f7a9d37bba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -446,14 +446,14 @@ if(NOT DEFINED INSTANCES_ONLY)
 
    rocm_package_setup_component(profiler
         LIBRARY_NAME composablekernel
-        PACKAGE_NAME ckProfiler
+        PACKAGE_NAME ckprofiler
    )
    add_subdirectory(profiler)
   else()
     #When building PROFILER_ONLY, label the package with GPU_ARCH
     rocm_package_setup_component(profiler
        LIBRARY_NAME composablekernel
-       PACKAGE_NAME ckProfiler_${GPU_ARCH}
+       PACKAGE_NAME ckprofiler_${GPU_ARCH}
     )
     add_subdirectory(profiler)
   endif()
diff --git a/client_example/20_image_to_column/CMakeLists.txt b/client_example/20_image_to_column/CMakeLists.txt
deleted file mode 100644
index 80edcd0416..0000000000
--- a/client_example/20_image_to_column/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_executable(client_image_to_column image_to_column.cpp)
-target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_operations)
diff --git a/client_example/22_im2col_col2im/CMakeLists.txt b/client_example/22_im2col_col2im/CMakeLists.txt
new file mode 100644
index 0000000000..47ac42fe87
--- /dev/null
+++ b/client_example/22_im2col_col2im/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(client_image_to_column image_to_column.cpp)
+target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_operations)
+
+add_executable(client_column_to_image column_to_image.cpp)
+target_link_libraries(client_column_to_image PRIVATE composable_kernel::device_operations)
diff --git a/client_example/22_im2col_col2im/column_to_image.cpp b/client_example/22_im2col_col2im/column_to_image.cpp
new file mode 100644
index 0000000000..43338ce408
--- /dev/null
+++ b/client_example/22_im2col_col2im/column_to_image.cpp
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp"
+#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType  = ck::half_t;
+using OutDataType = ck::half_t;
+
+using ImageLayout = ck::tensor_layout::convolution::GNHWC;
+
+static constexpr ck::index_t NumDimSpatial = 2;
+static constexpr ck::index_t G             = 1;
+static constexpr ck::index_t N             = 32; // batch size
+static constexpr ck::index_t C             = 32; // input channel (per group)
+static constexpr ck::index_t Y             = 3;  // filter H
+static constexpr ck::index_t X             = 3;  // filter W
+static constexpr ck::index_t Hi            = 28; // input H
+static constexpr ck::index_t Wi            = 28; // input W
+static constexpr ck::index_t Ho            = 28; // output H
+static constexpr ck::index_t Wo            = 28; // output W
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main()
+{
+
+    std::array<ck::index_t, 2> in_spatial_lengths{Hi, Wi};
+    std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
+    std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
+
+    // We have NHWGC in memory space (G is dummy)
+    // However, CK's API only accept length and stride with order of GNCHW
+    // Hence, we need to adjust the order of stride
+    std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
+    std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1};
+
+    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
+    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
+    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
+
+    SimpleDeviceMem in(sizeof(InDataType) * N * Ho * Wo * Y * X * C);
+    SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C);
+
+    using namespace ck::conv_tensor_rearrange_op;
+
+    using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange<NumDimSpatial,
+                                                                             ImageLayout,
+                                                                             InDataType,
+                                                                             OutDataType,
+                                                                             ColumnToImage>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr        = op_ptrs[i];
+        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        N,
+                                                        C,
+                                                        in_spatial_lengths,
+                                                        out_spatial_lengths,
+                                                        wei_spatial_lengths,
+                                                        image_strides,
+                                                        gemm_strides,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads);
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
+                                    sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
+
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(avg_time < best_avg_time)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cerr << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(best_op_id < 0)
+    {
+        std::cerr << "no suitable instance" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;
+
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
+                                                        out.GetDeviceBuffer(),
+                                                        N,
+                                                        C,
+                                                        in_spatial_lengths,
+                                                        out_spatial_lengths,
+                                                        wei_spatial_lengths,
+                                                        image_strides,
+                                                        gemm_strides,
+                                                        filter_strides,
+                                                        filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+}
diff --git a/client_example/20_image_to_column/image_to_column.cpp b/client_example/22_im2col_col2im/image_to_column.cpp
similarity index 84%
rename from client_example/20_image_to_column/image_to_column.cpp
rename to client_example/22_im2col_col2im/image_to_column.cpp
index ace4c1a681..a1447abf64 100644
--- a/client_example/20_image_to_column/image_to_column.cpp
+++ b/client_example/22_im2col_col2im/image_to_column.cpp
@@ -9,13 +9,14 @@
 #include <vector>
 
 #include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
+#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp"
+#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 
 using InDataType  = ck::half_t;
 using OutDataType = ck::half_t;
 
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using ImageLayout = ck::tensor_layout::convolution::GNHWC;
 
 static constexpr ck::index_t NumDimSpatial = 2;
 static constexpr ck::index_t G             = 1;
@@ -54,8 +55,8 @@ int main()
     // We have NHWGC in memory space (G is dummy)
     // However, CK's API only accept length and stride with order of GNCHW
     // Hence, we need to adjust the order of stride
-    std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
-    std::array<ck::index_t, 2> out_strides{Y * X * C, 1};
+    std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
+    std::array<ck::index_t, 2> gemm_strides{Y * X * C, 1};
 
     std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
     std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
@@ -65,8 +66,13 @@ int main()
     SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
     SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C);
 
-    using DeviceOp = ck::tensor_operation::device::
-        DeviceImageToColumn<NumDimSpatial, InLayout, InDataType, OutDataType>;
+    using namespace ck::conv_tensor_rearrange_op;
+
+    using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange<NumDimSpatial,
+                                                                             ImageLayout,
+                                                                             InDataType,
+                                                                             OutDataType,
+                                                                             ImageToColumn>;
 
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -92,8 +98,8 @@ int main()
                                                         in_spatial_lengths,
                                                         out_spatial_lengths,
                                                         wei_spatial_lengths,
-                                                        in_strides,
-                                                        out_strides,
+                                                        image_strides,
+                                                        gemm_strides,
                                                         filter_strides,
                                                         filter_dilations,
                                                         input_left_pads,
@@ -148,8 +154,8 @@ int main()
                                                         in_spatial_lengths,
                                                         out_spatial_lengths,
                                                         wei_spatial_lengths,
-                                                        in_strides,
-                                                        out_strides,
+                                                        image_strides,
+                                                        gemm_strides,
                                                         filter_strides,
                                                         filter_dilations,
                                                         input_left_pads,
diff --git a/example/52_image_to_column/CMakeLists.txt b/example/52_im2col_col2im/CMakeLists.txt
similarity index 50%
rename from example/52_image_to_column/CMakeLists.txt
rename to example/52_im2col_col2im/CMakeLists.txt
index 226e1fc5ae..a2dec9e805 100644
--- a/example/52_image_to_column/CMakeLists.txt
+++ b/example/52_im2col_col2im/CMakeLists.txt
@@ -2,9 +2,11 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
  if(gpu IN_LIST gpu_list AND target EQUAL 0)
-   add_custom_target(example_image_to_column)
+   add_custom_target(example_im2col_col2im)
    add_example_executable(example_image_to_column_f32 image_to_column_f32.cpp)
-   add_dependencies(example_image_to_column example_image_to_column_f32)
+   add_dependencies(example_im2col_col2im example_image_to_column_f32)
+   add_example_executable(example_column_to_image_f32 column_to_image_f32.cpp)
+   add_dependencies(example_im2col_col2im example_column_to_image_f32)
    set(target 1)
  endif()
 endforeach()
diff --git a/example/52_im2col_col2im/column_to_image_f32.cpp b/example/52_im2col_col2im/column_to_image_f32.cpp
new file mode 100644
index 0000000000..52144e6885
--- /dev/null
+++ b/example/52_im2col_col2im/column_to_image_f32.cpp
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+using InDataType  = FP32; // ck::bhalf_t;//FP32;
+using OutDataType = FP32; // ck::bhalf_t;//FP32;
+
+using ImLayout        = ck::tensor_layout::convolution::GNHWC;
+using ColumnToImageOp = ck::conv_tensor_rearrange_op::ColumnToImage;
+
+// clang-format off
+using DeviceColToImgInstance = ck::tensor_operation::device::DeviceColumnToImageImpl
+        //#####################|        Num| ImLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+                              < NDimSpatial, ImLayout, InDataType, OutDataType,   256,   128,   128, S<16, 16>,     1>;
+// clang-format on
+
+bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
+{
+
+    const auto N = conv_params.N_;
+    const auto C = conv_params.C_;
+
+    const ck::index_t NDoHoWo =
+        N * ck::accumulate_n<ck::index_t>(
+                conv_params.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+    const ck::index_t CZYX =
+        C * ck::accumulate_n<ck::index_t>(
+                conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+
+    const auto in_desc = HostTensorDescriptor({NDoHoWo, CZYX});
+    const auto out_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
+
+    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
+    std::array<ck::index_t, 2> gemm_m_k_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+
+    copy(conv_params.input_spatial_lengths_, input_spatial_lengths);
+    copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths);
+    copy(conv_params.output_spatial_lengths_, output_spatial_lengths);
+    copy(in_desc.GetStrides(), gemm_m_k_strides);
+    copy(out_desc.GetStrides(), image_g_n_c_wis_strides);
+    copy(conv_params.conv_filter_strides_, conv_filter_strides);
+    copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_params.input_left_pads_, input_left_pads);
+    copy(conv_params.input_right_pads_, input_right_pads);
+
+    Tensor<InDataType> in(in_desc);
+    Tensor<OutDataType> out_device(out_desc);
+    Tensor<OutDataType> out_host(out_desc);
+
+    std::cout << "in: " << in.mDesc << std::endl;
+    std::cout << "out: " << out_device.mDesc << std::endl;
+
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1: in.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 2}); break;
+    default: in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(in.mData.data());
+
+    // reset input to zero
+    out_device_buf.SetZero();
+
+    static_assert(std::is_default_constructible_v<DeviceColToImgInstance>);
+
+    // do conv
+    auto col2img  = DeviceColToImgInstance{};
+    auto invoker  = col2img.MakeInvoker();
+    auto argument = col2img.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                         out_device_buf.GetDeviceBuffer(),
+                                         N,
+                                         C,
+                                         input_spatial_lengths,
+                                         filter_spatial_lengths,
+                                         output_spatial_lengths,
+                                         image_g_n_c_wis_strides,
+                                         gemm_m_k_strides,
+                                         conv_filter_strides,
+                                         conv_filter_dilations,
+                                         input_left_pads,
+                                         input_right_pads);
+
+    if(!col2img.IsSupportedArgument(argument))
+    {
+        std::cerr << "wrong! device_col2img with the specified compilation parameters does "
+                     "not support this col2img problem"
+                  << std::endl;
+
+        return false;
+    }
+
+    float ave_time        = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
+    std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType));
+    float gb_per_sec      = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+
+    if(config.do_verification)
+    {
+        auto ref_column_to_image = ck::tensor_operation::host::
+            ReferenceColumnToImage<NDimSpatial, ImLayout, InDataType, OutDataType>();
+
+        auto ref_invoker = ref_column_to_image.MakeInvoker();
+
+        auto ref_argument = ref_column_to_image.MakeArgument(in,
+                                                             out_host,
+                                                             conv_params.filter_spatial_lengths_,
+                                                             conv_params.conv_filter_strides_,
+                                                             conv_params.conv_filter_dilations_,
+                                                             conv_params.input_left_pads_,
+                                                             conv_params.input_right_pads_);
+
+        if(!ref_column_to_image.IsSupportedArgument(&ref_argument))
+        {
+            std::cerr << "wrong! ref_col2img with the specified compilation parameters does "
+                         "not support this col2img problem"
+                      << std::endl;
+            return false;
+        }
+
+        ref_invoker.Run(ref_argument);
+        out_device_buf.FromDevice(out_device.mData.data());
+        return ck::utils::check_err(out_device.mData, out_host.mData);
+    }
+
+    return true;
+}
+
+int RunColumnToImageExample(int argc, char* argv[])
+{
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_params = DefaultConvParams;
+
+    if(!parse_cmd_args(argc, argv, config, conv_params))
+    {
+        return EXIT_FAILURE;
+    }
+
+    if(conv_params.num_dim_spatial_ != NDimSpatial)
+    {
+        std::cerr << "unsupported # of spatial dimensions" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return !RunColumnToImage(config, conv_params);
+}
+
+int main(int argc, char* argv[]) { return RunColumnToImageExample(argc, argv); }
diff --git a/example/52_image_to_column/common.hpp b/example/52_im2col_col2im/common.hpp
similarity index 94%
rename from example/52_image_to_column/common.hpp
rename to example/52_im2col_col2im/common.hpp
index 8510fa1e6d..61d30c4cb4 100644
--- a/example/52_image_to_column/common.hpp
+++ b/example/52_im2col_col2im/common.hpp
@@ -10,6 +10,7 @@
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 
 #include "ck/library/utility/algorithm.hpp"
@@ -20,6 +21,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -32,7 +34,7 @@ struct ExecutionConfig final
 {
     bool do_verification = true;
     int init_method      = 1;
-    bool time_kernel     = true;
+    bool time_kernel     = false;
 };
 
 #define DefaultConvParams                                                            \
diff --git a/example/52_image_to_column/image_to_column_f32.cpp b/example/52_im2col_col2im/image_to_column_f32.cpp
similarity index 89%
rename from example/52_image_to_column/image_to_column_f32.cpp
rename to example/52_im2col_col2im/image_to_column_f32.cpp
index c8a7e5f221..6d883460cc 100644
--- a/example/52_image_to_column/image_to_column_f32.cpp
+++ b/example/52_im2col_col2im/image_to_column_f32.cpp
@@ -6,15 +6,16 @@
 using InDataType  = FP32;
 using OutDataType = FP32;
 
-using InLayout = ck::tensor_layout::convolution::GNHWC;
+using ImLayout        = ck::tensor_layout::convolution::GNHWC;
+using ImageToColumnOp = ck::conv_tensor_rearrange_op::ImageToColumn;
 
 // clang-format off
 using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
-        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Num| ImLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
         //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
         //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
         //#####################|           |         |           |            |      |      |      |          |       |
-                              < NDimSpatial, InLayout, InDataType, OutDataType,   256,   128,   128, S<16, 16>,     1>;
+                              < NDimSpatial, ImLayout, InDataType, OutDataType,   256,   128,   128, S<16, 16>,      1>;
 // clang-format on
 
 bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)
@@ -31,14 +32,14 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
                 conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
 
     const auto in_desc =
-        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_params);
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
     const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
 
     std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
     std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
     std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
-    std::array<ck::index_t, 2> output_m_k_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
+    std::array<ck::index_t, 2> gemm_m_k_strides{};
     std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
     std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
     std::array<ck::index_t, NDimSpatial> input_left_pads{};
@@ -49,8 +50,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
     copy(conv_params.input_spatial_lengths_, input_spatial_lengths);
     copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths);
     copy(conv_params.output_spatial_lengths_, output_spatial_lengths);
-    copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
-    copy(out_desc.GetStrides(), output_m_k_strides);
+    copy(in_desc.GetStrides(), image_g_n_c_wis_strides);
+    copy(out_desc.GetStrides(), gemm_m_k_strides);
     copy(conv_params.conv_filter_strides_, conv_filter_strides);
     copy(conv_params.conv_filter_dilations_, conv_filter_dilations);
     copy(conv_params.input_left_pads_, input_left_pads);
@@ -90,8 +91,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
                                          input_spatial_lengths,
                                          filter_spatial_lengths,
                                          output_spatial_lengths,
-                                         input_g_n_c_wis_strides,
-                                         output_m_k_strides,
+                                         image_g_n_c_wis_strides,
+                                         gemm_m_k_strides,
                                          conv_filter_strides,
                                          conv_filter_dilations,
                                          input_left_pads,
@@ -114,7 +115,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv
     if(config.do_verification)
     {
         auto ref_image_to_column = ck::tensor_operation::host::
-            ReferenceImageToColumn<NDimSpatial, InLayout, InDataType, OutDataType>();
+            ReferenceImageToColumn<NDimSpatial, ImLayout, InDataType, OutDataType>();
 
         auto ref_invoker = ref_image_to_column.MakeInvoker();
 
diff --git a/example/60_gemm_multiABD/CMakeLists.txt b/example/60_gemm_multiABD/CMakeLists.txt
new file mode 100644
index 0000000000..9e2f70649f
--- /dev/null
+++ b/example/60_gemm_multiABD/CMakeLists.txt
@@ -0,0 +1,10 @@
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+list(APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942)
+set(target 0)
+foreach(gpu IN LISTS GPU_TARGETS)
+ if(gpu IN_LIST gpu_list2 AND target EQUAL 0)
+     add_example_executable(example_gemm_multiABD_xdl_fp16 gemm_multiABD_xdl_fp16.cpp)
+   set(target 1)
+ endif()
+endforeach()
+endif()
diff --git a/example/60_gemm_multiABD/gemm_multiABD_xdl_fp16.cpp b/example/60_gemm_multiABD/gemm_multiABD_xdl_fp16.cpp
new file mode 100644
index 0000000000..23d41d7cc4
--- /dev/null
+++ b/example/60_gemm_multiABD/gemm_multiABD_xdl_fp16.cpp
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using BDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DDataType        = F16;
+using EDataType        = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using DLayout = Row;
+using ELayout = Row;
+
+struct AddScale
+{
+    static constexpr auto I0 = ck::Number<0>{};
+    static constexpr auto I1 = ck::Number<1>{};
+    static constexpr auto I2 = ck::Number<2>{};
+    static constexpr auto I3 = ck::Number<3>{};
+
+    __host__ __device__ constexpr void
+    operator()(ck::half4_t& a, const ck::half4_t& a0, const ck::half4_t& a1) const
+    {
+        const auto a0_v_t = ck::vector_type<ck::half_t, 4>{a0};
+        const auto a1_v_t = ck::vector_type<ck::half_t, 4>{a1};
+
+        auto r_v_t = ck::vector_type<ck::half_t, 4>{};
+
+        r_v_t.AsType<ck::half_t>()(I0) =
+            scale * (a0_v_t.AsType<ck::half_t>()[I0] + a1_v_t.AsType<ck::half_t>()[I0]);
+        r_v_t.AsType<ck::half_t>()(I1) =
+            scale * (a0_v_t.AsType<ck::half_t>()[I1] + a1_v_t.AsType<ck::half_t>()[I1]);
+        r_v_t.AsType<ck::half_t>()(I2) =
+            scale * (a0_v_t.AsType<ck::half_t>()[I2] + a1_v_t.AsType<ck::half_t>()[I2]);
+        r_v_t.AsType<ck::half_t>()(I3) =
+            scale * (a0_v_t.AsType<ck::half_t>()[I3] + a1_v_t.AsType<ck::half_t>()[I3]);
+
+        a = r_v_t.AsType<ck::half4_t>()[I0];
+    }
+
+    __host__ __device__ constexpr void
+    operator()(ck::half_t& a, const ck::half_t& a0, const ck::half_t& a1) const
+    {
+        a = scale * (a0 + a1);
+    }
+
+    static constexpr ck::index_t vec_len = 4;
+
+    float scale = 1.0;
+};
+
+struct AlphaBetaAdd
+{
+    AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename E, typename C, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, float, ck::half_t>(
+        ck::half_t& e, const float& c, const ck::half_t& d) const
+    {
+        e = ck::type_convert<ck::half_t>(alpha_ * c + beta_ * ck::type_convert<float>(d));
+    };
+
+    float alpha_;
+    float beta_;
+};
+
+using AElementOp   = AddScale;
+using BElementOp   = PassThrough;
+using CDEElementOp = AlphaBetaAdd;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleABD_Xdl_CShuffle<
+    ck::Tuple<ALayout, ALayout>,
+    ck::Tuple<BLayout>,
+    ck::Tuple<DLayout>,
+    ELayout,
+    ck::Tuple<ADataType, ADataType>,
+    ck::Tuple<BDataType>,
+    AccDataType,
+    CShuffleDataType,
+    ck::Tuple<DDataType>,
+    EDataType,
+    AElementOp,
+    BElementOp,
+    CDEElementOp,
+    GemmSpec,
+    1,
+    256,
+    256,
+    128,
+    32,
+    8,
+    8,
+    32,
+    32,
+    4,
+    2,
+    S<4, 64, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    1,
+    S<4, 64, 1>,
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    1,
+    1,
+    1,
+    S<1, 32, 1, 8>,
+    8>;
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideD = 4096;
+    ck::index_t StrideE = 4096;
+
+    float alpha = 1.0f;
+    float beta  = 1.0f;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 6)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        alpha = std::stof(argv[4]);
+        beta  = std::stof(argv[5]);
+    }
+    else if(argc == 13)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideD = std::stoi(argv[9]);
+        StrideE = std::stoi(argv[10]);
+
+        alpha = std::stof(argv[11]);
+        beta  = std::stof(argv[12]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
+               "beta\n");
+        exit(0);
+    }
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<ADataType> a1_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
+    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "a1_m_k: " << a1_m_k.mDesc << std::endl;
+    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
+    }
+
+    DeviceMem a0_device_buf(sizeof(ADataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem a1_device_buf(sizeof(ADataType) * a1_m_k.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
+    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    a1_device_buf.ToDevice(a1_m_k.mData.data());
+    b_device_buf.ToDevice(b_k_n.mData.data());
+    d_device_buf.ToDevice(d_m_n.mData.data());
+    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+
+    auto a_element_op   = AElementOp{0.2};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{alpha, beta};
+
+    // do GEMM
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(std::array<const void*, 2>{a0_device_buf.GetDeviceBuffer(),
+                                                          a1_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, 1>{b_device_buf.GetDeviceBuffer()},
+                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                               e_device_buf.GetDeviceBuffer(),
+                               M,
+                               N,
+                               K,
+                               std::array<ck::index_t, 2>{StrideA, StrideA},
+                               std::array<ck::index_t, 1>{StrideB},
+                               std::array<ck::index_t, 1>{StrideD},
+                               StrideE,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op);
+
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+    if(do_verification)
+    {
+        Tensor<CShuffleDataType> c_m_n({M, N});
+
+        Tensor<ADataType> a_m_k({M, K});
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int k = 0; k < K; ++k)
+            {
+                a_element_op(a_m_k(m, k), a0_m_k(m, k), a1_m_k(m, k));
+            }
+        }
+
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                                BDataType,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                BElementOp,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, PassThrough{}, b_element_op, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
+    }
+
+    return 0;
+}
diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp
index 3d27103dcb..df311620e1 100644
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -34,6 +34,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
 #endif
         // warm up
         kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());
 
         const int nrepeat = 10;
 #if DEBUG_LOG
@@ -50,6 +51,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
         for(int i = 0; i < nrepeat; ++i)
         {
             kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+            hip_check_error(hipGetLastError());
         }
 
         hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
@@ -64,11 +66,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
     else
     {
         kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());
 
         return 0;
     }
 #else
     kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+    hip_check_error(hipGetLastError());
 
     return 0;
 #endif
@@ -101,6 +105,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
         // warm up
         preprocess();
         kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());
 
         const int nrepeat = 10;
 #if DEBUG_LOG
@@ -118,6 +123,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
         {
             preprocess();
             kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+            hip_check_error(hipGetLastError());
         }
 
         hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
@@ -133,11 +139,13 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
     {
         preprocess();
         kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());
 
         return 0;
     }
 #else
     kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+    hip_check_error(hipGetLastError());
 
     return 0;
 #endif
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp
new file mode 100644
index 0000000000..1a9bb3213a
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp"
+#include "ck/utility/is_detected.hpp"
+
+namespace ck {
+
+// Thread-group level multi-source, multi-destination tensor slice data movement
+// Assume:
+//   1. All sources and destinations are DynamicBuffer
+//   2. Same VectorDim and ScalerPerVector for all sources and destinations
+//   3. DstInMemOps are per destination tensor
+//   4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
+//   5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
+//
+// Does following things to avoid scratch memory issue
+//   1. Pass tensor descritpors by reference (or tuple of references)
+//   2. Does not keep reference to tensor descriptor
+//   3. Does not construct new tensor coordinate when call Run()
+template <typename ThreadGroup,
+          typename SrcDatas,
+          typename DstDatas,
+          typename SrcDescs,
+          typename DstDescs,
+          typename ElementwiseOperation,
+          typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
+          typename SliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          index_t SrcScalarPerVector,
+          index_t DstScalarPerVector,
+          typename ThreadTransferSrcResetCoordinateAfterRunFlags,
+          typename ThreadTransferDstResetCoordinateAfterRunFlags>
+struct ThreadGroupTensorSliceTransfer_v7r2
+{
+    static constexpr index_t nDim =
+        remove_cvref_t<tuple_element_t<0, SrcDescs>>::GetNumOfDimension();
+
+    static constexpr index_t nSrc = remove_cvref_t<SrcDescs>::Size();
+    static constexpr index_t nDst = remove_cvref_t<DstDescs>::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{};
+
+    __device__ constexpr ThreadGroupTensorSliceTransfer_v7r2(
+        const SrcDescs& src_descs,
+        const StaticallyIndexedArray<Index, nSrc>& src_block_slice_origins,
+        const DstDescs& dst_descs,
+        const StaticallyIndexedArray<Index, nDst>& dst_block_slice_origins,
+        const ElementwiseOperation& element_op)
+        : threadwise_transfer_(src_descs,
+                               StaticallyIndexedArray<Index, nSrc>{},
+                               dst_descs,
+                               StaticallyIndexedArray<Index, nDst>{},
+                               element_op)
+    {
+        static_assert(nSrc == SrcDatas::Size() && nSrc == SrcDescs::Size() &&
+                          nSrc == ThreadTransferSrcResetCoordinateAfterRunFlags::Size() &&
+                          nDst == DstDatas::Size() && nDst == DstDescs::Size() &&
+                          nDst == ThreadTransferDstResetCoordinateAfterRunFlags::Size(),
+                      "wrong!");
+
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, SrcDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_for<0, nDst, 1>{}([&](auto i) {
+            static_assert(
+                nDim == remove_cvref_t<tuple_element_t<i.value, DstDescs>>::GetNumOfDimension(),
+                "wrong!");
+        });
+
+        static_assert(nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(
+            is_same<SliceLengths, decltype(thread_slice_lengths * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! ThreadGroup::GetNumOfThread() too small");
+
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex(
+                make_multi_index(get_thread_local_1d_id()));
+
+            const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths;
+
+            const auto src_thread_slice_origins = generate_tuple(
+                [&](auto i) { return src_block_slice_origins[i] + thread_data_idx_begin; },
+                Number<nSrc>{});
+
+            const auto dst_thread_slice_origins = generate_tuple(
+                [&](auto i) { return dst_block_slice_origins[i] + thread_data_idx_begin; },
+                Number<nDst>{});
+
+            threadwise_transfer_.SetSrcSliceOrigins(src_descs, src_thread_slice_origins);
+            threadwise_transfer_.SetDstSliceOrigins(dst_descs, dst_thread_slice_origins);
+        }
+    }
+
+    template <typename SrcBuffers>
+    __device__ void RunRead(const SrcDescs& src_descs, const SrcBuffers& src_bufs)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.RunRead(src_descs, src_bufs);
+        }
+    }
+
+    template <typename T>
+    using is_tuple = decltype(std::declval<T&>().IsTuple());
+
+    template <typename DstBuffers>
+    __device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            if constexpr(is_detected<is_tuple, decltype(dst_bufs)>::value)
+                threadwise_transfer_.RunWrite(dst_descs, dst_bufs);
+            else
+                threadwise_transfer_.RunWrite(dst_descs, tie(dst_bufs));
+        }
+    }
+
+    template <typename SrcBuffers, typename DstBuffers>
+    __device__ void Run(const SrcDescs& src_descs,
+                        const SrcBuffers& src_bufs,
+                        const DstDescs& dst_descs,
+                        DstBuffers dst_bufs)
+    {
+        RunRead(src_descs, src_bufs);
+        RunWrite(dst_descs, dst_bufs);
+    }
+
+    template <index_t ISrc>
+    __device__ void
+    MoveSrcSliceWindow(const SrcDescs& src_descs, Number<ISrc> iSrc, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrcSliceWindow(src_descs, iSrc, step);
+        }
+    }
+
+    __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs, const Index& step)
+    {
+        static_for<0, SrcDescs::Size(), 1>{}(
+            [&](auto i) { MoveSrcSliceWindow(src_descs, i, step); });
+    }
+
+    template <index_t IDst>
+    __device__ void
+    MoveDstSliceWindow(const DstDescs& dst_descs, Number<IDst> iDst, const Index& step)
+    {
+        if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or
+           ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveDstSliceWindow(dst_descs, iDst, step);
+        }
+    }
+
+    __device__ void MoveDstSliceWindow(const DstDescs& dst_descs, const Index& step)
+    {
+        static_for<0, DstDescs::Size(), 1>{}(
+            [&](auto i) { MoveDstSliceWindow(dst_descs, i, step); });
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    using ThreadwiseTransfer =
+        ThreadwiseTensorSliceTransfer_v7r2<SrcDatas,
+                                           DstDatas,
+                                           SrcDescs,
+                                           DstDescs,
+                                           ElementwiseOperation,
+                                           DstInMemOps,
+                                           decltype(thread_slice_lengths),
+                                           SrcDimAccessOrder,
+                                           DstDimAccessOrder,
+                                           SrcVectorDim,
+                                           DstVectorDim,
+                                           SrcScalarPerVector,
+                                           DstScalarPerVector,
+                                           ThreadTransferSrcResetCoordinateAfterRunFlags,
+                                           ThreadTransferDstResetCoordinateAfterRunFlags>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp b/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp
new file mode 100644
index 0000000000..dc08a2c88b
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace conv_tensor_rearrange_op {
+
+struct BaseConvTensorRearrangeOp
+{
+};
+
+struct ImageToColumn : public BaseConvTensorRearrangeOp
+{
+    static constexpr const char* name = "Image to Column";
+};
+
+struct ColumnToImage : public BaseConvTensorRearrangeOp
+{
+    static constexpr const char* name = "Column to Image";
+};
+
+template <typename Op,
+          typename std::enable_if<std::is_base_of<BaseConvTensorRearrangeOp, Op>::value,
+                                  bool>::type = false>
+std::ostream& operator<<(std::ostream& os, const BaseConvTensorRearrangeOp&)
+{
+    os << Op::name;
+    return os;
+}
+
+} // namespace conv_tensor_rearrange_op
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/device_image_to_column.hpp b/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
similarity index 74%
rename from include/ck/tensor_operation/gpu/device/device_image_to_column.hpp
rename to include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
index 631d5189dd..898cfe0f2c 100644
--- a/include/ck/tensor_operation/gpu/device/device_image_to_column.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp
@@ -12,21 +12,26 @@ namespace tensor_operation {
 namespace device {
 
 /**
- * \brief Image to column.
+ * \brief Convolution Tensor Rearrange.
  *
- * This Device operator converts image ([G, N, Di, Hi, Wi, C]) to the gemm
- * problem([N * Do * Ho * Wo, Z *  Y * X * C]). G must be equal to 1.
+ * This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to
+ * the gemm problem([N * Do * Ho * Wo, Z *  Y * X * C]) (Image to Column) and
+ * conversion gemm form to the image (Column to Image).
+ *
+ * Note that G must be equal to 1.
  *
  * \tparam NDimSpatial Number of spatial dimensions.
- * \tparam InputLayout Input Layout.
+ * \tparam ImageLayout Input Layout.
  * \tparam InputDataType Input Data Type.
  * \tparam OutputDataType Output Data Type.
+ * \tparam ConvTensorRearrangeOp Operation type: ImageToColumn, ColumnToImage.
  */
 template <index_t NDimSpatial,
-          typename InputLayout,
+          typename ImageLayout,
           typename InputDataType,
-          typename OutputDataType>
-struct DeviceImageToColumn : public BaseOperator
+          typename OutputDataType,
+          typename ConvTensorRearrangeOp>
+struct DeviceConvTensorRearrange : public BaseOperator
 {
 
     /**
@@ -39,8 +44,8 @@ struct DeviceImageToColumn : public BaseOperator
      * \param input_spatial_lengths Input spatial lengths.
      * \param filter_spatial_lengths Filter spatial lengths.
      * \param output_spatial_lengths Output spatial lengths.
-     * \param input_g_n_c_wis_strides Input strides in order [G, N, C, D, H, W].
-     * \param output_m_k_strides Output strides.
+     * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W].
+     * \param gemm_m_k_strides Gemm form strides.
      * \param conv_filter_strides Convolution filter strides.
      * \param conv_filter_dilations Convolution filter dilations.
      * \param input_left_pads Convolution left pads.
@@ -55,8 +60,8 @@ struct DeviceImageToColumn : public BaseOperator
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& output_m_k_strides,
+                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                        const std::array<index_t, 2>& gemm_m_k_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<index_t, NDimSpatial>& input_left_pads,
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
new file mode 100644
index 0000000000..cbb9fadc6d
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// GEMM:
+//   input : A0[M, K], B0[K, N],
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGemmMultipleABD : public BaseOperator
+{
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
+                        std::array<const void*, NumBTensor> p_bs,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        std::array<ck::index_t, NumATensor> StrideAs,
+                        std::array<ck::index_t, NumBTensor> StrideBs,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        ck::index_t StrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
new file mode 100644
index 0000000000..f8b4a01681
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp
@@ -0,0 +1,621 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+
+#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
+
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+#include "ck/host_utility/io.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// Image to column for input layout NDHWC:
+//   input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C]
+//   output : image [N, Di, Hi, Wi, C]
+template <index_t NDimSpatial,
+          typename ImageLayout,
+          typename InputDataType,
+          typename OutputDataType,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t KPerBlock,
+          typename ThreadClusterLengths,
+          index_t ScalarPerVector,
+          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
+struct DeviceColumnToImageImpl
+    : public DeviceConvTensorRearrange<NDimSpatial,
+                                       ImageLayout,
+                                       InputDataType,
+                                       OutputDataType,
+                                       conv_tensor_rearrange_op::ColumnToImage>
+{
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+
+    static constexpr auto ZIdx = Number<I0>{};
+    static constexpr auto YIdx = NDimSpatial == 1 ? I0 : Number<NDimSpatial - I2>{};
+    static constexpr auto XIdx = Number<NDimSpatial - I1>{};
+
+    static constexpr auto spatial_offset = Number<3>{};
+
+    static constexpr auto conv_to_gemm_transformer =
+        TransformConvFwdToGemm<NDimSpatial, ConvolutionForwardSpecialization::Default>{};
+    static constexpr auto matrix_padder =
+        MatrixPadder<GemmSpecialization::MKPadding, index_t, index_t, index_t>{
+            MPerBlock, 0 /* NPerBlock*/, KPerBlock};
+
+    // Calculate number of independent filters for given conv params
+    static index_t GetNumberOfIndependentFilters(const index_t input_spatial_len,
+                                                 const index_t left_pad,
+                                                 const index_t right_pad,
+                                                 const index_t filter_len,
+                                                 const index_t filter_stride,
+                                                 const index_t filter_dilation,
+                                                 const index_t image_offset)
+    {
+        const index_t x_eff = (filter_len - 1) * filter_dilation + 1;
+        const index_t next_filter_padded =
+            math::integer_divide_ceil(x_eff, filter_stride) * filter_stride;
+        // If filter_stride >= x_eff then each filter is independent
+        const index_t independent_filter_stride =
+            filter_stride >= x_eff ? filter_stride : next_filter_padded;
+        const index_t w_eff = input_spatial_len - image_offset + left_pad + right_pad - x_eff;
+        // There are no independent filters
+        if(w_eff < 0)
+            return 0;
+        const index_t independent_kernels_num = w_eff / independent_filter_stride + 1;
+        return independent_kernels_num;
+    }
+
+    // Make column form descriptor
+    static auto
+    MakeInputDescriptor_M_K(const ck::index_t N,
+                            const ck::index_t C,
+                            const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                            const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                            const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                            const std::array<index_t, 2>& gemm_m_k_strides,
+                            const std::array<index_t, NDimSpatial>& independent_filters,
+                            const std::array<index_t, NDimSpatial>& effs)
+    {
+        const index_t DoHoWo = ck::accumulate_n<index_t>(
+            output_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
+        const index_t CZYX =
+            C * ck::accumulate_n<index_t>(
+                    filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
+
+        const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1];
+        // Calculate the appropriate stride for each set of independent filters
+        // in each dimension
+        const index_t WStride =
+            math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0];
+        const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) *
+                                output_spatial_lengths[XIdx] * gemm_m_k_strides[I0];
+        const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) *
+                                output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] *
+                                gemm_m_k_strides[I0];
+        // Create descriptor for independent filters in each dimension and
+        // then merge them into column form
+        if constexpr(NDimSpatial == 1)
+        {
+            const auto desc_gemm_form =
+                make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX),
+                                             make_tuple(NStride, WStride, gemm_m_k_strides[I1]));
+            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
+                desc_gemm_form,
+                make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])),
+                           make_pass_through_transform(CZYX)),
+                make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_gemm_form_merged_filters);
+            return desc_m_k;
+        }
+        else if constexpr(NDimSpatial == 2)
+        {
+            const auto desc_gemm_form = make_naive_tensor_descriptor(
+                make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX),
+                make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1]));
+            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
+                desc_gemm_form,
+                make_tuple(make_merge_transform(
+                               make_tuple(N, independent_filters[YIdx], independent_filters[XIdx])),
+                           make_pass_through_transform(CZYX)),
+                make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_gemm_form_merged_filters);
+            return desc_m_k;
+        }
+        else if constexpr(NDimSpatial == 3)
+        {
+            const auto desc_gemm_form = make_naive_tensor_descriptor(
+                make_tuple(N,
+                           independent_filters[ZIdx],
+                           independent_filters[YIdx],
+                           independent_filters[XIdx],
+                           CZYX),
+                make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1]));
+            const auto desc_gemm_form_merged_filters = transform_tensor_descriptor(
+                desc_gemm_form,
+                make_tuple(make_merge_transform(make_tuple(N,
+                                                           independent_filters[ZIdx],
+                                                           independent_filters[YIdx],
+                                                           independent_filters[XIdx])),
+                           make_pass_through_transform(CZYX)),
+                make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+            const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_gemm_form_merged_filters);
+            return desc_m_k;
+        }
+    }
+
+    // Use MakeADescriptor_M_K from grouped convolution forward
+    static auto
+    MakeOutDescriptor_M_K(const ck::index_t N,
+                          const ck::index_t C,
+                          const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                          const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                          const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                          const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                          const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                          const std::array<index_t, NDimSpatial>& input_left_pads,
+                          const std::array<index_t, NDimSpatial>& input_right_pads,
+                          const std::array<index_t, NDimSpatial>& image_offsets,
+                          const std::array<index_t, NDimSpatial>& independent_filters,
+                          const std::array<index_t, NDimSpatial>& effs)
+    {
+        std::array<index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{1};
+        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{1};
+        std::array<index_t, NDimSpatial + 3> c_g_n_k_wos_lengths{1};
+
+        auto copy = [](const auto& x, auto& y, index_t dst_offset) {
+            std::copy(x.begin(), x.end(), y.begin() + dst_offset);
+        };
+
+        copy(input_spatial_lengths, a_g_n_c_wis_lengths, spatial_offset);
+        copy(filter_spatial_lengths, b_g_k_c_xs_lengths, spatial_offset);
+        // Calculate descriptor only for independent filters
+        copy(independent_filters, c_g_n_k_wos_lengths, spatial_offset);
+
+        // fill only significant values (C and N)
+        a_g_n_c_wis_lengths[I1] = N;
+        a_g_n_c_wis_lengths[I2] = C;
+        b_g_k_c_xs_lengths[I2]  = C;
+        c_g_n_k_wos_lengths[I1] = N;
+
+        // Modify pads to apply offsets
+        std::array<index_t, NDimSpatial> input_left_pads_with_offset;
+        for(index_t i = 0; i < NDimSpatial; i++)
+        {
+            input_left_pads_with_offset[i] = math::max(0, input_left_pads[i] - image_offsets[i]);
+        }
+        // Modify input spatial lengths to apply offsets
+        for(index_t i = 0; i < NDimSpatial; i++)
+        {
+            a_g_n_c_wis_lengths[i + spatial_offset] -=
+                math::max(0, image_offsets[i] - input_left_pads[i]);
+        }
+
+        // Strides to next independent filters
+        std::array<index_t, NDimSpatial> independent_filter_strides;
+        for(index_t i = 0; i < NDimSpatial; i++)
+        {
+            index_t independent_filter_stride =
+                math::integer_divide_ceil(effs[i], conv_filter_strides[i]) * conv_filter_strides[i];
+            // If conv stride is greater than whole filter size, use conv stride
+            independent_filter_strides[i] = conv_filter_strides[i] >= effs[i]
+                                                ? conv_filter_strides[i]
+                                                : independent_filter_stride;
+        }
+
+        // Calculate image form descriptor for the modified convolution problem
+        const auto in_gemmmraw_gemmkraw_desc =
+            conv_to_gemm_transformer.template MakeADescriptor_M_K<ImageLayout>(
+                a_g_n_c_wis_lengths,
+                image_g_n_c_wis_strides,
+                b_g_k_c_xs_lengths,
+                {}, // not needed for A Descriptor
+                c_g_n_k_wos_lengths,
+                {}, // not needed for A Descriptor
+                // conv_filter_strides,
+                independent_filter_strides,
+                conv_filter_dilations,
+                input_left_pads_with_offset,
+                input_right_pads);
+
+        const auto in_gemmm_gemmk_desc =
+            matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc);
+        return in_gemmm_gemmk_desc;
+    }
+
+    using InputGridDesc =
+        remove_cvref_t<decltype(MakeInputDescriptor_M_K(1, 1, {}, {}, {}, {}, {}, {}))>;
+    using OutputGridDesc = remove_cvref_t<decltype(MakeOutDescriptor_M_K(
+        1, 1, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>;
+
+    using Block2ETileMap = remove_cvref_t<
+        decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
+            InputGridDesc{}))>;
+
+    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
+                                                                  InputDataType,
+                                                                  OutputGridDesc,
+                                                                  OutputDataType,
+                                                                  BlockSize,
+                                                                  MPerBlock,
+                                                                  KPerBlock,
+                                                                  ThreadClusterLengths,
+                                                                  ScalarPerVector,
+                                                                  InMemoryDataOperationEnum::Add,
+                                                                  Block2ETileMap>;
+
+    struct Argument : public BaseArgument
+    {
+        Argument(const void* p_in, // input image
+                 void* p_out,      // output image
+                 const ck::index_t N,
+                 const ck::index_t C,
+                 const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                 const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                 const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                 const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                 const std::array<index_t, 2>& gemm_m_k_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                 const std::array<index_t, NDimSpatial>& input_left_pads,
+                 const std::array<index_t, NDimSpatial>& input_right_pads)
+            : C_(C),
+              X_(filter_spatial_lengths[NDimSpatial - I1]),
+              p_in_{static_cast<const InputDataType*>(p_in)},
+              p_out_{static_cast<OutputDataType*>(p_out)},
+              image_g_n_c_wis_strides_{image_g_n_c_wis_strides},
+              conv_filter_strides_{conv_filter_strides},
+              conv_filter_dilations_{conv_filter_dilations},
+              input_left_pads_{input_left_pads},
+              input_right_pads_{input_right_pads}
+        {
+            const index_t x_eff =
+                (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1;
+            const index_t y_eff =
+                NDimSpatial < 2
+                    ? I1
+                    : (filter_spatial_lengths[YIdx] - 1) * conv_filter_dilations[YIdx] + 1;
+            const index_t z_eff =
+                NDimSpatial < 3
+                    ? I1
+                    : (filter_spatial_lengths[ZIdx] - 1) * conv_filter_dilations[ZIdx] + 1;
+
+            // Iterate over sets of independent filters
+            for(int z_img_offset = 0; z_img_offset < z_eff;
+                z_img_offset += conv_filter_strides[ZIdx])
+            {
+                for(int y_img_offset = 0; y_img_offset < y_eff;
+                    y_img_offset += conv_filter_strides[YIdx])
+                {
+                    for(int x_img_offset = 0; x_img_offset < x_eff;
+                        x_img_offset += conv_filter_strides[XIdx])
+                    {
+
+                        std::array<index_t, NDimSpatial> image_offsets;
+                        std::array<index_t, NDimSpatial> effs;
+                        // Calculate the starting offset for a given set of
+                        // independent filters
+                        if constexpr(NDimSpatial == 1)
+                        {
+                            image_offsets = {x_img_offset};
+                            effs          = {x_eff};
+                        }
+                        if constexpr(NDimSpatial == 2)
+                        {
+                            image_offsets = {y_img_offset, x_img_offset};
+                            effs          = {y_eff, x_eff};
+                        }
+                        else if constexpr(NDimSpatial == 3)
+                        {
+                            image_offsets = {z_img_offset, y_img_offset, x_img_offset};
+                            effs          = {z_eff, y_eff, x_eff};
+                        }
+
+                        std::array<index_t, NDimSpatial> independent_filters;
+                        for(index_t i = 0; i < NDimSpatial; i++)
+                        {
+                            independent_filters[i] =
+                                GetNumberOfIndependentFilters(input_spatial_lengths[i],
+                                                              input_left_pads[i],
+                                                              input_right_pads[i],
+                                                              filter_spatial_lengths[i],
+                                                              conv_filter_strides[i],
+                                                              conv_filter_dilations[i],
+                                                              image_offsets[i]);
+                        }
+                        const index_t independent_filters_acum = ck::accumulate_n<index_t>(
+                            independent_filters.begin(), NDimSpatial, 1, std::multiplies<>());
+                        if(independent_filters_acum <= 0)
+                            continue;
+
+                        const auto in_grid_desc_m_k =
+                            MakeInputDescriptor_M_K(N,
+                                                    C,
+                                                    filter_spatial_lengths,
+                                                    output_spatial_lengths,
+                                                    conv_filter_strides,
+                                                    gemm_m_k_strides,
+                                                    independent_filters,
+                                                    effs);
+                        const auto out_grid_desc_m_k =
+                            MakeOutDescriptor_M_K(N,
+                                                  C,
+                                                  input_spatial_lengths,
+                                                  filter_spatial_lengths,
+                                                  image_g_n_c_wis_strides,
+                                                  conv_filter_strides,
+                                                  conv_filter_dilations,
+                                                  input_left_pads,
+                                                  input_right_pads,
+                                                  image_offsets,
+                                                  independent_filters,
+                                                  effs);
+                        in_grid_desc_m_k_container_.push_back(in_grid_desc_m_k);
+                        out_grid_desc_m_k_container_.push_back(out_grid_desc_m_k);
+
+                        const index_t x_idx = x_img_offset / conv_filter_strides[XIdx];
+                        const index_t y_idx = y_img_offset / conv_filter_strides[YIdx];
+                        const index_t z_idx = z_img_offset / conv_filter_strides[ZIdx];
+
+                        const index_t x_offset_with_pad =
+                            math::max(0, x_img_offset - input_left_pads[XIdx]);
+                        const index_t y_offset_with_pad =
+                            math::max(0, y_img_offset - input_left_pads[YIdx]);
+                        const index_t z_offset_with_pad =
+                            math::max(0, z_img_offset - input_left_pads[ZIdx]);
+
+                        // Memory offsets to next set of independent filters,
+                        // move to independent filters in each dimension
+                        const index_t in_offset =
+                            x_idx * gemm_m_k_strides[0] +
+                            y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] +
+                            z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] *
+                                output_spatial_lengths[XIdx];
+                        // Move to independent filters in appropriate dimensions
+                        const index_t out_offset =
+                            x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] +
+                            y_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + YIdx] +
+                            z_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + ZIdx];
+
+                        const InputDataType* p_in_with_offset =
+                            static_cast<const InputDataType*>(p_in) + in_offset;
+                        OutputDataType* p_out_with_offset =
+                            static_cast<OutputDataType*>(p_out) + out_offset;
+                        p_in_container_.push_back(p_in_with_offset);
+                        p_out_container_.push_back(p_out_with_offset);
+                    }
+                }
+            }
+        }
+
+        void Print() const
+        {
+            for(std::size_t i = 0; i < in_grid_desc_m_k_container_.size(); i++)
+            {
+                std::cout << in_grid_desc_m_k_container_[i] << std::endl;
+                std::cout << out_grid_desc_m_k_container_[i] << std::endl;
+            }
+        }
+
+        const ck::index_t C_;
+        const ck::index_t X_;
+
+        const InputDataType* p_in_;
+        OutputDataType* p_out_;
+
+        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides_;
+        const std::array<index_t, NDimSpatial>& conv_filter_strides_;
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations_;
+        const std::array<index_t, NDimSpatial>& input_left_pads_;
+        const std::array<index_t, NDimSpatial>& input_right_pads_;
+
+        std::vector<InputGridDesc> in_grid_desc_m_k_container_;
+        std::vector<OutputGridDesc> out_grid_desc_m_k_container_;
+
+        std::vector<const InputDataType*> p_in_container_;
+        std::vector<OutputDataType*> p_out_container_;
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            float elapsed_time = 0.f;
+            const auto kernel  = kernel_tensor_rearrange<InputGridDesc,
+                                                        InputDataType,
+                                                        OutputGridDesc,
+                                                        OutputDataType,
+                                                        Block2ETileMap,
+                                                        GridwiseTensorRearrangeKernel>;
+
+            // Execute each set of independent filters
+            for(std::size_t i = 0; i < arg.in_grid_desc_m_k_container_.size(); i++)
+            {
+                const auto block_2_tile_map =
+                    BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, InputGridDesc>(
+                        arg.out_grid_desc_m_k_container_[i]);
+                const index_t grid_size =
+                    block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]);
+                elapsed_time += launch_and_time_kernel(stream_config,
+                                                       kernel,
+                                                       dim3(grid_size),
+                                                       dim3(BlockSize),
+                                                       0,
+                                                       arg.in_grid_desc_m_k_container_[i],
+                                                       arg.p_in_container_[i],
+                                                       arg.out_grid_desc_m_k_container_[i],
+                                                       arg.p_out_container_[i],
+                                                       block_2_tile_map);
+            }
+            return elapsed_time;
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    bool IsSupportedArgument(const Argument& arg)
+    {
+        using namespace tensor_layout::convolution;
+        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
+                       std::is_same_v<ImageLayout, GNDHWC>))
+        {
+            return false;
+        }
+
+        const auto w_pad_left  = arg.input_left_pads_[NDimSpatial - I1];
+        const auto w_pad_right = arg.input_right_pads_[NDimSpatial - I1];
+        const auto dilation_x  = arg.conv_filter_dilations_[NDimSpatial - I1];
+        const auto stride_x    = arg.conv_filter_strides_[NDimSpatial - I1];
+        bool is_w_packed       = arg.image_g_n_c_wis_strides_[NDimSpatial + I2] == arg.C_;
+        bool is_c_packed       = arg.image_g_n_c_wis_strides_[I2] == 1;
+
+        // check vector acces with c not packed
+        if(!is_c_packed && ScalarPerVector != 1)
+            return false;
+        // check vector access of filter window row (only C if C is not packed)
+        if(!is_w_packed && arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of filter window row (X * C)
+        if(arg.X_ * arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of pads (w_pad_left/w_pad_right * C)
+        if(w_pad_left * arg.C_ % ScalarPerVector != 0 ||
+           w_pad_right * arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of with stride and pad
+        if((w_pad_left != 0 || w_pad_right != 0) && stride_x > 1 && arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of with dilation
+        if(dilation_x > 1 && arg.C_ % ScalarPerVector != 0)
+            return false;
+
+        bool valid = true;
+        for(std::size_t i = 0; i < arg.in_grid_desc_m_k_container_.size(); i++)
+        {
+            valid &= GridwiseTensorRearrangeKernel::CheckValidity(
+                arg.in_grid_desc_m_k_container_[i], arg.out_grid_desc_m_k_container_[i]);
+        }
+        return valid;
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const void* p_in, // input image
+                             void* p_out,      // output image
+                             const ck::index_t N,
+                             const ck::index_t C,
+                             const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                             const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                             const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                             const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                             const std::array<index_t, 2>& gemm_m_k_strides,
+                             const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                             const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                             const std::array<index_t, NDimSpatial>& input_left_pads,
+                             const std::array<index_t, NDimSpatial>& input_right_pads)
+    {
+        return Argument{static_cast<const InputDataType*>(p_in),
+                        static_cast<OutputDataType*>(p_out),
+                        N,
+                        C,
+                        input_spatial_lengths,
+                        filter_spatial_lengths,
+                        output_spatial_lengths,
+                        image_g_n_c_wis_strides,
+                        gemm_m_k_strides,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in, // input image
+                        void* p_out,      // output image
+                        const ck::index_t N,
+                        const ck::index_t C,
+                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                        const std::array<index_t, 2>& gemm_m_k_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads) override
+    {
+        return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
+                                          static_cast<OutputDataType*>(p_out),
+                                          N,
+                                          C,
+                                          input_spatial_lengths,
+                                          filter_spatial_lengths,
+                                          output_spatial_lengths,
+                                          image_g_n_c_wis_strides,
+                                          gemm_m_k_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceColumnToImage"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << KPerBlock << ", "
+            << ScalarPerVector
+            << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
new file mode 100644
index 0000000000..827a341a50
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_abd_xdl_cshuffle.hpp
@@ -0,0 +1,766 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_abd.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+
+template <typename GridwiseGemm,
+          typename AsPointer,
+          typename BsPointer,
+          typename DsPointer,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          typename AsGridDesc_AK0_M_AK1,
+          typename BsGridDesc_BK0_N_BK1,
+          typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename Block2ETileMap,
+          bool HasMainKBlockLoop>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_gemm_multiple_abd_xdl_cshuffle(
+            AsPointer p_as_grid,
+            BsPointer p_bs_grid,
+            DsPointer p_ds_grid,
+            EDataType* __restrict__ p_e_grid,
+            const AElementwiseOperation a_element_op,
+            const BElementwiseOperation b_element_op,
+            const CDEElementwiseOperation cde_element_op,
+            const AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1,
+            const BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1,
+            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                ds_grid_desc_mblock_mperblock_nblock_nperblock,
+            const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
+                e_grid_desc_mblock_mperblock_nblock_nperblock,
+            const Block2ETileMap block_2_etile_map)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_as_grid,
+                                                  p_bs_grid,
+                                                  p_ds_grid,
+                                                  p_e_grid,
+                                                  p_shared,
+                                                  a_element_op,
+                                                  b_element_op,
+                                                  cde_element_op,
+                                                  as_grid_desc_ak0_m_ak1,
+                                                  bs_grid_desc_bk0_n_bk1,
+                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  e_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                  block_2_etile_map);
+#else
+    ignore = p_as_grid;
+    ignore = p_bs_grid;
+    ignore = p_ds_grid;
+    ignore = p_e_grid;
+    ignore = a_element_op;
+    ignore = b_element_op;
+    ignore = cde_element_op;
+    ignore = as_grid_desc_ak0_m_ak1;
+    ignore = bs_grid_desc_bk0_n_bk1;
+    ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = e_grid_desc_mblock_mperblock_nblock_nperblock;
+    ignore = block_2_etile_map;
+#endif
+}
+
+} // namespace ck
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+// GEMM:
+//   input : A[M, K]
+//   input : B[N, K]
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <typename AsLayout,
+          typename BsLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename AsDataType,
+          typename BsDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t NumGemmKPrefetchStage,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          index_t ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          index_t BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CDEBlockTransferScalarPerVector_NPerBlock,
+          LoopScheduler LoopSched     = make_default_loop_scheduler(),
+          PipelineVersion PipelineVer = PipelineVersion::v1>
+struct DeviceGemmMultipleABD_Xdl_CShuffle : public DeviceGemmMultipleABD<AsLayout,
+                                                                         BsLayout,
+                                                                         DsLayout,
+                                                                         ELayout,
+                                                                         AsDataType,
+                                                                         BsDataType,
+                                                                         DsDataType,
+                                                                         EDataType,
+                                                                         AElementwiseOperation,
+                                                                         BElementwiseOperation,
+                                                                         CDEElementwiseOperation>
+{
+    using DeviceOp = DeviceGemmMultipleABD_Xdl_CShuffle;
+
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+#if 0
+    static constexpr auto matrix_padder =
+        MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};
+
+    static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideAs)
+    {
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, AsLayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(StrideAs, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, AsLayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(I1, StrideAs));
+            }
+        }();
+
+        return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
+    }
+
+    static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideBs)
+    {
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BsLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(I1, StrideBs));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BsLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(StrideBs, I1));
+            }
+        }();
+
+        return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
+    }
+
+    template <typename ELay>
+    static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
+    {
+        const auto e_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ELay>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(StrideE, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ELay>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(I1, StrideE));
+            }
+        }();
+
+        return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
+    }
+
+    static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
+                                         const std::array<index_t, NumDTensor>& NRaws,
+                                         const std::array<index_t, NumDTensor>& DsStride)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(MRaws[i], NRaws[i], DsStride[i]);
+            },
+            Number<NumDTensor>{});
+    }
+#endif
+    using ComputeDataType = EDataType;
+
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemmMultipleABD_xdl_cshuffle<
+        AsDataType,
+        BsDataType,
+        ComputeDataType,
+        AccDataType,
+        CShuffleDataType,
+        DsDataType,
+        EDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CDEElementwiseOperation,
+        InMemoryDataOperationEnum::Set,
+        NumGemmKPrefetchStage,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEBlockTransferScalarPerVector_NPerBlock,
+        LoopSched,
+        PipelineVer>;
+
+    // desc for problem definition
+    using AsGridDesc_M_K =
+        remove_cvref_t<decltype(GridwiseGemm::template MakeAsGridDescriptor_M_K<AsLayout, GemmSpec>(
+            {}, {}, {}))>;
+    using BsGridDesc_N_K =
+        remove_cvref_t<decltype(GridwiseGemm::template MakeBsGridDescriptor_N_K<BsLayout, GemmSpec>(
+            {}, {}, {}))>;
+    using DsGridDesc_M_N =
+        remove_cvref_t<decltype(GridwiseGemm::template MakeDsGridDescriptor_M_N<DsLayout, GemmSpec>(
+            {}, {}, {}))>;
+    using EGridDesc_M_N =
+        decltype(GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(1, 1, 1));
+
+    // desc for blockwise copy
+    using AsGridDesc_AK0_M_AK1 =
+        remove_cvref_t<decltype(GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(AsGridDesc_M_K{}))>;
+    using BsGridDesc_BK0_N_BK1 =
+        remove_cvref_t<decltype(GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(BsGridDesc_N_K{}))>;
+    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            DsGridDesc_M_N{}))>;
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
+        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            EGridDesc_M_N{}))>;
+
+    // block-to-e-tile map
+    using Block2ETileMap =
+        remove_cvref_t<decltype(GridwiseGemm::MakeBlock2ETileMap(EGridDesc_M_N{}))>;
+
+    // Argument
+    struct Argument : public BaseArgument
+    {
+        Argument(std::array<const void*, NumATensor> p_as_grid,
+                 std::array<const void*, NumBTensor> p_bs_grid,
+                 std::array<const void*, NumDTensor> p_ds_grid,
+                 void* p_e_grid,
+                 index_t MRaw,
+                 index_t NRaw,
+                 index_t KRaw,
+                 std::array<index_t, NumATensor> StrideAs,
+                 std::array<index_t, NumBTensor> StrideBs,
+                 std::array<index_t, NumDTensor> StrideDs,
+                 index_t StrideE,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : p_as_grid_{},
+              p_bs_grid_{},
+              p_ds_grid_{},
+              p_e_grid_{static_cast<EDataType*>(p_e_grid)},
+              as_grid_desc_m_k_{},
+              bs_grid_desc_n_k_{},
+              ds_grid_desc_m_n_{},
+              e_grid_desc_m_n_{GridwiseGemm::template MakeEGridDescriptor_M_N<ELayout, GemmSpec>(
+                  MRaw, NRaw, StrideE)},
+              as_grid_desc_ak0_m_ak1_{},
+              bs_grid_desc_bk0_n_bk1_{},
+              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              block_2_etile_map_{GridwiseGemm::MakeBlock2ETileMap(e_grid_desc_m_n_)},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op},
+              MRaw_{MRaw},
+              NRaw_{NRaw},
+              KRaw_{KRaw}
+        {
+            // populate pointer, desc for As
+            static_for<0, NumATensor, 1>{}([&](auto i) {
+                using ALayout   = remove_cvref_t<tuple_element_t<i.value, AsLayout>>;
+                using ADataType = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+
+                // A pointer
+                p_as_grid_(i) = static_cast<const ADataType*>(p_as_grid[i]);
+
+                // A desc
+                as_grid_desc_m_k_(i) =
+                    GridwiseGemm::template MakeAGridDescriptor_M_K<ALayout, GemmSpec>(
+                        MRaw, KRaw, StrideAs[i]);
+            });
+
+            // populate pointer, desc for Bs
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                using BLayout   = remove_cvref_t<tuple_element_t<i.value, BsLayout>>;
+                using BDataType = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+
+                // B pointer
+                p_bs_grid_(i) = static_cast<const BDataType*>(p_bs_grid[i]);
+
+                // B desc
+                bs_grid_desc_n_k_(i) =
+                    GridwiseGemm::template MakeBGridDescriptor_N_K<BLayout, GemmSpec>(
+                        KRaw, NRaw, StrideBs[i]);
+            });
+
+            // populate pointer, desc for Ds
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout   = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                // D pointer
+                p_ds_grid_(i) = static_cast<const DDataType*>(p_ds_grid[i]);
+
+                // D desc
+                ds_grid_desc_m_n_(i) =
+                    GridwiseGemm::template MakeEGridDescriptor_M_N<DLayout, GemmSpec>(
+                        MRaw, NRaw, StrideDs[i]);
+            });
+
+            // populate desc for Ds/E
+            if(GridwiseGemm::CheckValidity(as_grid_desc_m_k_,
+                                           bs_grid_desc_n_k_,
+                                           ds_grid_desc_m_n_,
+                                           e_grid_desc_m_n_,
+                                           block_2_etile_map_))
+            {
+                as_grid_desc_ak0_m_ak1_ =
+                    GridwiseGemm::MakeAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k_);
+
+                bs_grid_desc_bk0_n_bk1_ =
+                    GridwiseGemm::MakeBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k_);
+
+                ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        ds_grid_desc_m_n_);
+
+                e_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                    GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                        e_grid_desc_m_n_);
+            }
+        }
+
+        void Print() const
+        {
+            // std::cout << "A[M, K]: " << as_grid_desc_m_k_ << std::endl;
+            // std::cout << "B[N, K]: " << bs_grid_desc_n_k_ << std::endl;
+            // static_for<0, NumDTensor, 1>{}(
+            //[&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; });
+            // std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl;
+        }
+
+        //  private:
+        // pointers
+        typename GridwiseGemm::AsGridPointer p_as_grid_;
+        typename GridwiseGemm::BsGridPointer p_bs_grid_;
+        typename GridwiseGemm::DsGridPointer p_ds_grid_;
+        EDataType* p_e_grid_;
+
+        // tensor descriptors for problem definiton
+        AsGridDesc_M_K as_grid_desc_m_k_;
+        BsGridDesc_N_K bs_grid_desc_n_k_;
+        DsGridDesc_M_N ds_grid_desc_m_n_;
+        EGridDesc_M_N e_grid_desc_m_n_;
+
+        // tensor descriptors for block/thread-wise copy
+        AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1_;
+        BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1_;
+        DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+            ds_grid_desc_mblock_mperblock_nblock_nperblock_;
+        EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_;
+
+        // block-to-e-tile map
+        Block2ETileMap block_2_etile_map_;
+
+        // element-wise op
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+
+        // for checking vector load/store
+        index_t MRaw_;
+        index_t NRaw_;
+        index_t KRaw_;
+    };
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(!GridwiseGemm::CheckValidity(arg.as_grid_desc_m_k_,
+                                            arg.bs_grid_desc_n_k_,
+                                            arg.ds_grid_desc_m_n_,
+                                            arg.e_grid_desc_m_n_,
+                                            arg.block_2_etile_map_))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            const index_t grid_size =
+                arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_);
+
+            auto launch_kernel = [&](auto has_main_k_block_loop) {
+                constexpr bool has_main_loop = has_main_k_block_loop.value;
+
+                const auto kernel = kernel_gemm_multiple_abd_xdl_cshuffle<
+                    GridwiseGemm,
+                    typename GridwiseGemm::AsGridPointer,
+                    typename GridwiseGemm::BsGridPointer,
+                    typename GridwiseGemm::DsGridPointer,
+                    EDataType,
+                    AElementwiseOperation,
+                    BElementwiseOperation,
+                    CDEElementwiseOperation,
+                    DeviceOp::AsGridDesc_AK0_M_AK1,
+                    DeviceOp::BsGridDesc_BK0_N_BK1,
+                    DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                    DeviceOp::Block2ETileMap,
+                    has_main_loop>;
+
+                return launch_and_time_kernel(stream_config,
+                                              kernel,
+                                              dim3(grid_size),
+                                              dim3(BlockSize),
+                                              0,
+                                              arg.p_as_grid_,
+                                              arg.p_bs_grid_,
+                                              arg.p_ds_grid_,
+                                              arg.p_e_grid_,
+                                              arg.a_element_op_,
+                                              arg.b_element_op_,
+                                              arg.cde_element_op_,
+                                              arg.as_grid_desc_ak0_m_ak1_,
+                                              arg.bs_grid_desc_bk0_n_bk1_,
+                                              arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                              arg.block_2_etile_map_);
+            };
+
+            const auto K = arg.as_grid_desc_m_k_[I0].GetLength(I1);
+
+            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
+            {
+                return launch_kernel(integral_constant<bool, true>{});
+            }
+            else
+            {
+                return launch_kernel(integral_constant<bool, false>{});
+            }
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        // check vector load/store
+        {
+            using Row = ck::tensor_layout::gemm::RowMajor;
+            using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+            bool all_valid = true;
+
+            static_for<0, NumATensor, 1>{}([&](auto i) {
+                using ALayout = remove_cvref_t<tuple_element_t<i.value, AsLayout>>;
+                // check vector load of A
+                if constexpr(is_same_v<ALayout, Row> && ABlockTransferSrcVectorDim == 2)
+                {
+                    if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                    {
+                        all_valid = false;
+                    }
+                }
+                else if constexpr(is_same_v<ALayout, Col> && ABlockTransferSrcVectorDim == 1)
+                {
+                    // FIXME: not rigorous
+                    if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0)
+                    {
+                        all_valid = false;
+                    }
+                }
+                else
+                {
+                    all_valid = false;
+                }
+            });
+
+            static_for<0, NumBTensor, 1>{}([&](auto i) {
+                using BLayout = remove_cvref_t<tuple_element_t<i.value, BsLayout>>;
+                // check vector laod of B
+                if constexpr(is_same_v<BLayout, Col> && BBlockTransferSrcVectorDim == 2)
+                {
+                    if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                    {
+                        all_valid = false;
+                    }
+                }
+                else if constexpr(is_same_v<BLayout, Row> && BBlockTransferSrcVectorDim == 1)
+                {
+                    // FIXME: not rigorous
+                    if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0)
+                    {
+                        all_valid = false;
+                    }
+                }
+                else
+                {
+                    all_valid = false;
+                }
+            });
+
+            // check vector load of Ds
+            // only support RowMajor for now
+
+            static_for<0, NumDTensor, 1>{}([&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                if constexpr(!is_same_v<DLayout, Row>)
+                {
+                    all_valid = false;
+                }
+            });
+
+            if(!all_valid)
+            {
+                return false;
+            }
+
+            // check vector store of E
+            // only support RowMajor for now
+            if constexpr(is_same_v<ELayout, Row>)
+            {
+                if(arg.NRaw_ % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+        return GridwiseGemm::CheckValidity(arg.as_grid_desc_m_k_,
+                                           arg.bs_grid_desc_n_k_,
+                                           arg.ds_grid_desc_m_n_,
+                                           arg.e_grid_desc_m_n_,
+                                           arg.block_2_etile_map_);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(std::array<const void*, NumATensor> p_as,
+                             std::array<const void*, NumBTensor> p_bs,
+                             std::array<const void*, NumDTensor> p_ds,
+                             void* p_e,
+                             index_t MRaw,
+                             index_t NRaw,
+                             index_t KRaw,
+                             std::array<index_t, NumATensor> StrideAs,
+                             std::array<index_t, NumBTensor> StrideBs,
+                             std::array<index_t, NumDTensor> StrideDs,
+                             index_t StrideE,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{p_as,
+                        p_bs,
+                        p_ds,
+                        p_e,
+                        MRaw,
+                        NRaw,
+                        KRaw,
+                        StrideAs,
+                        StrideBs,
+                        StrideDs,
+                        StrideE,
+                        a_element_op,
+                        b_element_op,
+                        cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(std::array<const void*, NumATensor> p_as,
+                        std::array<const void*, NumBTensor> p_bs,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        index_t MRaw,
+                        index_t NRaw,
+                        index_t KRaw,
+                        std::array<ck::index_t, NumATensor> StrideAs,
+                        std::array<ck::index_t, NumBTensor> StrideBs,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        index_t StrideE,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) override
+    {
+        return std::make_unique<Argument>(p_as,
+                                          p_bs,
+                                          p_ds,
+                                          p_e,
+                                          MRaw,
+                                          NRaw,
+                                          KRaw,
+                                          StrideAs,
+                                          StrideBs,
+                                          StrideDs,
+                                          StrideE,
+                                          a_element_op,
+                                          b_element_op,
+                                          cde_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<LoopScheduler, std::string> LoopSchedToString{
+            {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
+
+        std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
+                                                                       {PipelineVersion::v2, "v2"}};
+
+        // clang-format off
+        str << "DeviceGemmMultipleABD_Xdl_CShuffle"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << KPerBlock << ", "
+            << AK1 << ", "
+            << BK1 << ", "
+            << MPerXDL << ", "
+            << NPerXDL << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << CShuffleMXdlPerWavePerShuffle << ", "
+            << CShuffleNXdlPerWavePerShuffle << ", "
+            << getGemmSpecializationString(GemmSpec)
+            << ">"
+            << " LoopScheduler: "
+            << LoopSchedToString[LoopSched] << ", "
+            << "PipelineVersion: "
+            << PipelineVersionToString[PipelineVer];
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
index 19f126e66f..8b4ae7875c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
@@ -5,64 +5,41 @@
 
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
-#include "ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 #include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
 #include "ck/host_utility/io.hpp"
 
 namespace ck {
 namespace tensor_operation {
 namespace device {
 
-template <typename InputGridDesc,
-          typename InputDataType,
-          typename OutputGridDesc,
-          typename OutputDataType,
-          typename Block2ETileMap,
-          typename GridwiseImageToColumnKernel>
-__global__ void
-#if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
-#endif
-        kernel_image_to_column(const InputGridDesc in_grid_desc,
-                               const InputDataType* __restrict__ p_in_global,
-                               const OutputGridDesc out_grid_desc,
-                               OutputDataType* __restrict__ p_out_global,
-                               const Block2ETileMap block_2_tile_map)
-{
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||             \
-    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
-    defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
-    GridwiseImageToColumnKernel::Run(
-        in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map);
-#else
-    ignore = in_grid_desc;
-    ignore = p_in_global;
-    ignore = out_grid_desc;
-    ignore = p_out_global;
-    ignore = block_2_tile_map;
-#endif
-}
-
 // Image to column for input layout NDHWC:
-//   input : input image [N, Di, Hi, Wi, C],
-//   output : output image [N * Do * Ho * Wo, Z * Y * X * C]
+//   input : input image [N, Di, Hi, Wi, C]
+//   output : gemm form [N * Do * Ho * Wo, Z * Y * X * C]
 template <index_t NDimSpatial,
-          typename InputLayout,
+          typename ImageLayout,
           typename InputDataType,
           typename OutputDataType,
           index_t BlockSize,
           index_t MPerBlock,
           index_t KPerBlock,
           typename ThreadClusterLengths,
-          index_t ScalarPerVector>
+          index_t ScalarPerVector,
+          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
 struct DeviceImageToColumnImpl
-    : public DeviceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>
+    : public DeviceConvTensorRearrange<NDimSpatial,
+                                       ImageLayout,
+                                       InputDataType,
+                                       OutputDataType,
+                                       conv_tensor_rearrange_op::ImageToColumn>
 {
 
     static constexpr auto I0 = Number<0>{};
@@ -83,7 +60,7 @@ struct DeviceImageToColumnImpl
                             const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                             const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                             const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-                            const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
+                            const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
                             const std::array<index_t, NDimSpatial>& conv_filter_strides,
                             const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                             const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -110,9 +87,9 @@ struct DeviceImageToColumnImpl
         c_g_n_k_wos_lengths[I1] = N;
 
         const auto in_gemmmraw_gemmkraw_desc =
-            conv_to_gemm_transformer.template MakeADescriptor_M_K<InputLayout>(
+            conv_to_gemm_transformer.template MakeADescriptor_M_K<ImageLayout>(
                 a_g_n_c_wis_lengths,
-                input_g_n_c_wis_strides,
+                image_g_n_c_wis_strides,
                 b_g_k_c_xs_lengths,
                 {}, // not needed for A Descriptor
                 c_g_n_k_wos_lengths,
@@ -132,7 +109,7 @@ struct DeviceImageToColumnImpl
                           const ck::index_t C,
                           const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                           const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-                          const std::array<index_t, 2>& output_m_k_strides)
+                          const std::array<index_t, 2>& gemm_m_k_strides)
     {
         const index_t NDoHoWo =
             N * ck::accumulate_n<index_t>(
@@ -141,7 +118,7 @@ struct DeviceImageToColumnImpl
             C * ck::accumulate_n<index_t>(
                     filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
         const auto desc_mraw_kraw = make_naive_tensor_descriptor(
-            make_tuple(NDoHoWo, CZYX), make_tuple(output_m_k_strides[I0], output_m_k_strides[I1]));
+            make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1]));
 
         const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
         return desc_m_k;
@@ -155,28 +132,29 @@ struct DeviceImageToColumnImpl
         decltype(BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
             OutputGridDesc{}))>;
 
-    using GridwiseImageToColumnKernel = GridwiseImageToColumn<InputGridDesc,
-                                                              InputDataType,
-                                                              OutputGridDesc,
-                                                              OutputDataType,
-                                                              BlockSize,
-                                                              MPerBlock,
-                                                              KPerBlock,
-                                                              ThreadClusterLengths,
-                                                              ScalarPerVector,
-                                                              Block2ETileMap>;
+    using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange<InputGridDesc,
+                                                                  InputDataType,
+                                                                  OutputGridDesc,
+                                                                  OutputDataType,
+                                                                  BlockSize,
+                                                                  MPerBlock,
+                                                                  KPerBlock,
+                                                                  ThreadClusterLengths,
+                                                                  ScalarPerVector,
+                                                                  InMemoryDataOperationEnum::Set,
+                                                                  Block2ETileMap>;
 
     struct Argument : public BaseArgument
     {
         Argument(const void* p_in, // input image
-                 void* p_out,      // output image
+                 void* p_out,      // gemm form
                  const ck::index_t N,
                  const ck::index_t C,
                  const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                  const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                  const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-                 const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
-                 const std::array<index_t, 2>& output_m_k_strides,
+                 const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                 const std::array<index_t, 2>& gemm_m_k_strides,
                  const std::array<index_t, NDimSpatial>& conv_filter_strides,
                  const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                  const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -185,7 +163,7 @@ struct DeviceImageToColumnImpl
               X_(filter_spatial_lengths[NDimSpatial - I1]),
               p_in_{static_cast<const InputDataType*>(p_in)},
               p_out_{static_cast<OutputDataType*>(p_out)},
-              input_g_n_c_wis_strides_{input_g_n_c_wis_strides},
+              image_g_n_c_wis_strides_{image_g_n_c_wis_strides},
               conv_filter_strides_{conv_filter_strides},
               conv_filter_dilations_{conv_filter_dilations},
               input_left_pads_{input_left_pads},
@@ -197,7 +175,7 @@ struct DeviceImageToColumnImpl
                                                         input_spatial_lengths,
                                                         filter_spatial_lengths,
                                                         output_spatial_lengths,
-                                                        input_g_n_c_wis_strides,
+                                                        image_g_n_c_wis_strides,
 
                                                         conv_filter_strides,
                                                         conv_filter_dilations,
@@ -205,7 +183,7 @@ struct DeviceImageToColumnImpl
                                                         input_right_pads);
 
             out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
-                N, C, filter_spatial_lengths, output_spatial_lengths, output_m_k_strides);
+                N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides);
         }
 
         void Print() const
@@ -220,7 +198,7 @@ struct DeviceImageToColumnImpl
         const InputDataType* p_in_;
         OutputDataType* p_out_;
 
-        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides_;
+        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides_;
         const std::array<index_t, NDimSpatial>& conv_filter_strides_;
         const std::array<index_t, NDimSpatial>& conv_filter_dilations_;
         const std::array<index_t, NDimSpatial>& input_left_pads_;
@@ -243,12 +221,12 @@ struct DeviceImageToColumnImpl
                 BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
                     arg.out_grid_desc_m_k_);
             const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_);
-            const auto kernel       = kernel_image_to_column<InputGridDesc,
-                                                       InputDataType,
-                                                       OutputGridDesc,
-                                                       OutputDataType,
-                                                       Block2ETileMap,
-                                                       GridwiseImageToColumnKernel>;
+            const auto kernel       = kernel_tensor_rearrange<InputGridDesc,
+                                                        InputDataType,
+                                                        OutputGridDesc,
+                                                        OutputDataType,
+                                                        Block2ETileMap,
+                                                        GridwiseTensorRearrangeKernel>;
 
             float elapsed_time = launch_and_time_kernel(stream_config,
                                                         kernel,
@@ -273,12 +251,8 @@ struct DeviceImageToColumnImpl
     bool IsSupportedArgument(const Argument& arg)
     {
         using namespace tensor_layout::convolution;
-        if(!(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
-             std::is_same_v<InputLayout, GNDHWC>))
-        {
-            return false;
-        }
-        if(!(NDimSpatial >= 1 && NDimSpatial <= 3))
+        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
+                       std::is_same_v<ImageLayout, GNDHWC>))
         {
             return false;
         }
@@ -287,8 +261,8 @@ struct DeviceImageToColumnImpl
         const auto w_pad_right = arg.input_right_pads_[NDimSpatial - I1];
         const auto dilation_x  = arg.conv_filter_dilations_[NDimSpatial - I1];
         const auto stride_x    = arg.conv_filter_strides_[NDimSpatial - I1];
-        bool is_w_packed       = arg.input_g_n_c_wis_strides_[NDimSpatial + I2] == arg.C_;
-        bool is_c_packed       = arg.input_g_n_c_wis_strides_[I2] == 1;
+        bool is_w_packed       = arg.image_g_n_c_wis_strides_[NDimSpatial + I2] == arg.C_;
+        bool is_c_packed       = arg.image_g_n_c_wis_strides_[I2] == 1;
 
         // check vector acces with c not packed
         if(!is_c_packed && ScalarPerVector != 1)
@@ -310,8 +284,8 @@ struct DeviceImageToColumnImpl
         if(dilation_x > 1 && arg.C_ % ScalarPerVector != 0)
             return false;
 
-        return GridwiseImageToColumnKernel::CheckValidity(arg.in_grid_desc_m_k_,
-                                                          arg.out_grid_desc_m_k_);
+        return GridwiseTensorRearrangeKernel::CheckValidity(arg.in_grid_desc_m_k_,
+                                                            arg.out_grid_desc_m_k_);
     }
 
     bool IsSupportedArgument(const BaseArgument* p_arg) override
@@ -320,14 +294,14 @@ struct DeviceImageToColumnImpl
     }
 
     static auto MakeArgument(const void* p_in, // input image
-                             void* p_out,      // output image
+                             void* p_out,      // gemm form
                              const ck::index_t N,
                              const ck::index_t C,
                              const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                              const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                              const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-                             const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
-                             const std::array<index_t, 2>& output_m_k_strides,
+                             const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                             const std::array<index_t, 2>& gemm_m_k_strides,
                              const std::array<index_t, NDimSpatial>& conv_filter_strides,
                              const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                              const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -340,8 +314,8 @@ struct DeviceImageToColumnImpl
                         input_spatial_lengths,
                         filter_spatial_lengths,
                         output_spatial_lengths,
-                        input_g_n_c_wis_strides,
-                        output_m_k_strides,
+                        image_g_n_c_wis_strides,
+                        gemm_m_k_strides,
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
@@ -352,14 +326,14 @@ struct DeviceImageToColumnImpl
 
     std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_in, // input image
-                        void* p_out,      // output image
+                        void* p_out,      // gemm form
                         const ck::index_t N,
                         const ck::index_t C,
                         const std::array<index_t, NDimSpatial>& input_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
                         const std::array<index_t, NDimSpatial>& output_spatial_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
-                        const std::array<index_t, 2>& output_m_k_strides,
+                        const std::array<index_t, NDimSpatial + 3>& image_g_n_c_wis_strides,
+                        const std::array<index_t, 2>& gemm_m_k_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_strides,
                         const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                         const std::array<index_t, NDimSpatial>& input_left_pads,
@@ -372,8 +346,8 @@ struct DeviceImageToColumnImpl
                                           input_spatial_lengths,
                                           filter_spatial_lengths,
                                           output_spatial_lengths,
-                                          input_g_n_c_wis_strides,
-                                          output_m_k_strides,
+                                          image_g_n_c_wis_strides,
+                                          gemm_m_k_strides,
                                           conv_filter_strides,
                                           conv_filter_dilations,
                                           input_left_pads,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
new file mode 100644
index 0000000000..d98cbea166
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp
@@ -0,0 +1,1033 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r2.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+namespace ck {
+
+// GEMM:
+//   input : A0[M, K], A1[M, K]
+//   input : B0[N, K], B1[N, K]
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <typename AsDataType,
+          typename BsDataType,
+          typename ComputeDataType_,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          InMemoryDataOperationEnum EGlobalMemoryDataOperation,
+          index_t NumGemmKPrefetchStage,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1Value,
+          index_t BK1Value,
+          index_t MPerXdl,
+          index_t NPerXdl,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool AThreadTransferSrcResetCoordinateAfterRun,
+          index_t ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BThreadTransferSrcResetCoordinateAfterRun,
+          index_t BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CDEShuffleBlockTransferScalarPerVector_NPerBlock,
+          LoopScheduler LoopSched,
+          PipelineVersion PipelineVer = PipelineVersion::v1>
+struct GridwiseGemmMultipleABD_xdl_cshuffle
+{
+    static constexpr index_t NumATensor = AsDataType::Size();
+    static constexpr index_t NumBTensor = BsDataType::Size();
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    using GemmSpecialization = ck::tensor_operation::device::GemmSpecialization;
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+    static constexpr auto I4 = Number<4>{};
+    static constexpr auto I5 = Number<5>{};
+    static constexpr auto I6 = Number<6>{};
+    static constexpr auto I7 = Number<7>{};
+
+    // K1 should be Number<...>
+    static constexpr auto AK1         = Number<AK1Value>{};
+    static constexpr auto BK1         = Number<BK1Value>{};
+    static constexpr auto AK0PerBlock = Number<KPerBlock / AK1Value>{};
+    static constexpr auto BK0PerBlock = Number<KPerBlock / BK1Value>{};
+
+    using ThisThreadBlock = ThisThreadBlock<BlockSize>;
+
+    using GridwiseGemmPipe = remove_cvref_t<
+        decltype(GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
+
+#if CK_WORKAROUND_DENORM_FIX
+    using ComputeDataType =
+        conditional_t<is_same_v<ComputeDataType_, ck::half_t>, ck::bhalf_t, ComputeDataType_>;
+#else
+    using ComputeDataType = ComputeDataType_;
+#endif
+
+    __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
+    {
+        // A matrix in LDS memory, dst of blockwise copy
+        return make_naive_tensor_descriptor(
+            make_tuple(AK0PerBlock, Number<MPerBlock>{}, AK1),
+            make_tuple(Number<MPerBlock + ABlockLdsExtraM>{} * AK1, AK1, I1));
+    }
+
+    __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
+    {
+        // B matrix in LDS memory, dst of blockwise copy
+        return make_naive_tensor_descriptor(
+            make_tuple(BK0PerBlock, Number<NPerBlock>{}, BK1),
+            make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1, BK1, I1));
+    }
+
+    __host__ __device__ static constexpr auto
+    GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
+    {
+        constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+        constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            make_naive_tensor_descriptor_packed(
+                make_tuple(I1,
+                           Number<CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl>{},
+                           I1,
+                           Number<CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>{}));
+
+        return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    static constexpr auto MakeAsGridPointer()
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using ADataType = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+
+                return static_cast<const ADataType*>(nullptr);
+            },
+            Number<NumATensor>{});
+    }
+
+    static constexpr auto MakeBsGridPointer()
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using BDataType = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+
+                return static_cast<const BDataType*>(nullptr);
+            },
+            Number<NumBTensor>{});
+    }
+
+    // ck::Tuple<const D0DataType*, const D1DataType*, ...>
+    static constexpr auto MakeDsGridPointer()
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+
+                return static_cast<const DDataType*>(nullptr);
+            },
+            Number<NumDTensor>{});
+    }
+
+    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
+    {
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1, BK1);
+
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
+
+        // LDS allocation for C shuffle in LDS
+        constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+            GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+        constexpr auto c_block_size =
+            c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize();
+
+        return math::max((a_block_space_size_aligned + b_block_space_size_aligned) *
+                             sizeof(ComputeDataType),
+                         c_block_size * sizeof(CShuffleDataType));
+    }
+
+    // A desc for source in blockwise copy
+    template <typename AGridDesc_M_K>
+    __host__ __device__ static constexpr auto
+    MakeAGridDescriptor_AK0_M_AK1(const AGridDesc_M_K& a_grid_desc_m_k)
+    {
+        const auto M = a_grid_desc_m_k.GetLength(I0);
+        const auto K = a_grid_desc_m_k.GetLength(I1);
+
+        const auto AK0 = K / AK1;
+
+        return transform_tensor_descriptor(a_grid_desc_m_k,
+                                           make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
+                                                      make_pass_through_transform(M)),
+                                           make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                           make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    }
+
+    template <typename AsGridDesc_M_K>
+    __host__ __device__ static constexpr auto
+    MakeAsGridDescriptor_AK0_M_AK1(const AsGridDesc_M_K& as_grid_desc_m_k)
+    {
+        return generate_tuple(
+            [&](auto i) { return MakeAGridDescriptor_AK0_M_AK1(as_grid_desc_m_k[i]); },
+            Number<NumATensor>{});
+    }
+
+    // B desc for source in blockwise copy
+    template <typename BGridDesc_N_K>
+    __host__ __device__ static constexpr auto
+    MakeBGridDescriptor_BK0_N_BK1(const BGridDesc_N_K& b_grid_desc_n_k)
+    {
+        const auto N = b_grid_desc_n_k.GetLength(I0);
+        const auto K = b_grid_desc_n_k.GetLength(I1);
+
+        const auto BK0 = K / BK1;
+
+        return transform_tensor_descriptor(b_grid_desc_n_k,
+                                           make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),
+                                                      make_pass_through_transform(N)),
+                                           make_tuple(Sequence<1>{}, Sequence<0>{}),
+                                           make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+    }
+
+    template <typename BsGridDesc_N_K>
+    __host__ __device__ static constexpr auto
+    MakeBsGridDescriptor_BK0_N_BK1(const BsGridDesc_N_K& bs_grid_desc_n_k)
+    {
+        return generate_tuple(
+            [&](auto i) { return MakeBGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k[i]); },
+            Number<NumBTensor>{});
+    }
+
+    // E desc for destination in blockwise copy
+    template <typename EGridDesc_M_N>
+    __host__ __device__ static constexpr auto
+    MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N& e_grid_desc_m_n)
+    {
+        const auto M = e_grid_desc_m_n.GetLength(I0);
+        const auto N = e_grid_desc_m_n.GetLength(I1);
+
+        const auto MBlock = M / MPerBlock;
+        const auto NBlock = N / NPerBlock;
+
+        const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
+            e_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, Number<MPerBlock>{})),
+                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+
+        return e_grid_desc_mblock_mperblock_nblock_nperblock;
+    }
+
+    // Ds desc for source in blockwise copy
+    template <typename DsGridDesc_M_N>
+    __host__ __device__ static constexpr auto
+    MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const DsGridDesc_M_N& ds_grid_desc_m_n)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                return MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n[i]);
+            },
+            Number<NumDTensor>{});
+    }
+
+    // return block_id to E matrix tile idx (m0, n0) mapping
+    template <typename EGridDesc_M_N>
+    __host__ __device__ static constexpr auto
+    MakeBlock2ETileMap(const EGridDesc_M_N& e_grid_desc_m_n)
+    {
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, EGridDesc_M_N>(
+            e_grid_desc_m_n);
+    }
+
+    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
+    template <typename AsGridDesc_M_K,
+              typename BsGridDesc_N_K,
+              typename DsGridDesc_M_N,
+              typename EGridDesc_M_N,
+              typename Block2ETileMap>
+    __host__ __device__ static constexpr bool CheckValidity(const AsGridDesc_M_K& as_grid_desc_m_k,
+                                                            const BsGridDesc_N_K& bs_grid_desc_n_k,
+                                                            const DsGridDesc_M_N& ds_grid_desc_m_n,
+                                                            const EGridDesc_M_N& e_grid_desc_m_n,
+                                                            const Block2ETileMap& block_2_etile_map)
+    {
+        static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) &&
+                          (NPerBlock % (NXdlPerWave * NPerXdl)) == 0,
+                      "Invalid tuning param!");
+        static_assert(KPerBlock % AK1Value == 0 && KPerBlock % BK1Value == 0,
+                      "KPerBlock must be divisible by AK1Value and BK1Value!");
+
+        const auto M  = as_grid_desc_m_k[I0].GetLength(I0);
+        const auto N  = bs_grid_desc_n_k[I0].GetLength(I0);
+        const auto AK = as_grid_desc_m_k[I0].GetLength(I1);
+        const auto BK = bs_grid_desc_n_k[I0].GetLength(I1);
+
+        // check consistency of desc
+        if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1) && AK == BK))
+        {
+            return false;
+        }
+
+        constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+
+        bool valid = true;
+        static_for<0, NumATensor, 1>{}([&](auto i) {
+            using ADataType = remove_cvref_t<tuple_element_t<i.value, AsDataType>>;
+            valid =
+                valid && (as_grid_desc_m_k[i].GetElementSpaceSize() * sizeof(ADataType) <= TwoGB);
+            valid = valid && (M == as_grid_desc_m_k[i].GetLength(I0) &&
+                              AK == as_grid_desc_m_k[i].GetLength(I1));
+        });
+
+        static_for<0, NumBTensor, 1>{}([&](auto i) {
+            using BDataType = remove_cvref_t<tuple_element_t<i.value, BsDataType>>;
+            valid =
+                valid && (bs_grid_desc_n_k[i].GetElementSpaceSize() * sizeof(BDataType) <= TwoGB);
+            valid = valid && (N == bs_grid_desc_n_k[i].GetLength(I0) &&
+                              BK == bs_grid_desc_n_k[i].GetLength(I1));
+        });
+
+        static_for<0, NumDTensor, 1>{}([&](auto i) {
+            valid = valid && (M == ds_grid_desc_m_n[i].GetLength(I0) &&
+                              N == ds_grid_desc_m_n[i].GetLength(I1));
+        });
+
+        if(!valid)
+        {
+            return false;
+        }
+
+        // check tile size
+        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && AK % KPerBlock == 0))
+        {
+            return false;
+        }
+
+        // check gridwise gemm pipeline
+        const auto num_k_loop = AK / KPerBlock;
+
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
+        {
+            return false;
+        }
+
+        // check block-to-E-tile
+        if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n))
+        {
+            return false;
+        }
+
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        // check tensor size: cannot be larger than 2GB each
+
+        if(!(e_grid_desc_m_n.GetElementSpaceSize() * sizeof(EDataType) <= TwoGB))
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
+    {
+        const index_t num_loop = K / KPerBlock;
+
+        return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
+    }
+
+    using AsGridPointer = decltype(MakeAsGridPointer());
+    using BsGridPointer = decltype(MakeBsGridPointer());
+    using DsGridPointer = decltype(MakeDsGridPointer());
+
+    template <typename ALayout, GemmSpecialization GemmSpec>
+    __host__ __device__ static auto
+    MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA)
+    {
+        constexpr auto matrix_padder =
+            ck::tensor_operation::device::MatrixPadder<GemmSpec, index_t, index_t, index_t>{
+                MPerBlock, NPerBlock, KPerBlock};
+
+        const auto a_grid_desc_mraw_kraw = [&]() {
+            if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(StrideA, I1));
+            }
+            else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw),
+                                                    make_tuple(I1, StrideA));
+            }
+        }();
+
+        return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw);
+    }
+
+    template <typename AsLayout, GemmSpecialization GemmSpec>
+    __host__ __device__ static auto
+    MakeAsGridDescriptor_M_K(const std::array<index_t, NumATensor>& MRaws,
+                             const std::array<index_t, NumATensor>& KRaws,
+                             const std::array<index_t, NumATensor>& AsStride)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using ALayout = remove_cvref_t<tuple_element_t<i.value, AsLayout>>;
+
+                return MakeEGridDescriptor_M_N<ALayout, GemmSpec>(MRaws[i], KRaws[i], AsStride[i]);
+            },
+            Number<NumATensor>{});
+    }
+
+    template <typename BLayout, GemmSpecialization GemmSpec>
+    __host__ __device__ static auto
+    MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB)
+    {
+        constexpr auto matrix_padder =
+            ck::tensor_operation::device::MatrixPadder<GemmSpec, index_t, index_t, index_t>{
+                MPerBlock, NPerBlock, KPerBlock};
+
+        const auto b_grid_desc_nraw_kraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(I1, StrideB));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw),
+                                                    make_tuple(StrideB, I1));
+            }
+        }();
+
+        return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw);
+    }
+
+    template <typename BsLayout, GemmSpecialization GemmSpec>
+    __host__ __device__ static auto
+    MakeBsGridDescriptor_N_K(const std::array<index_t, NumBTensor>& KRaws,
+                             const std::array<index_t, NumBTensor>& NRaws,
+                             const std::array<index_t, NumBTensor>& BsStride)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using BLayout = remove_cvref_t<tuple_element_t<i.value, BsLayout>>;
+
+                return MakeBGridDescriptor_N_K<BLayout, GemmSpec>(KRaws[i], NRaws[i], BsStride[i]);
+            },
+            Number<NumBTensor>{});
+    }
+
+    template <typename ELayout, GemmSpecialization GemmSpec>
+    __host__ __device__ static auto
+    MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE)
+    {
+        constexpr auto matrix_padder =
+            ck::tensor_operation::device::MatrixPadder<GemmSpec, index_t, index_t, index_t>{
+                MPerBlock, NPerBlock, KPerBlock};
+        const auto e_grid_desc_mraw_nraw = [&]() {
+            if constexpr(is_same<tensor_layout::gemm::RowMajor, ELayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(StrideE, I1));
+            }
+            else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ELayout>::value)
+            {
+                return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw),
+                                                    make_tuple(I1, StrideE));
+            }
+        }();
+
+        return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
+    }
+
+    template <typename DsLayout, GemmSpecialization GemmSpec>
+    __host__ __device__ static auto
+    MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
+                             const std::array<index_t, NumDTensor>& NRaws,
+                             const std::array<index_t, NumDTensor>& DsStride)
+    {
+        return generate_tuple(
+            [&](auto i) {
+                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+
+                return MakeEGridDescriptor_M_N<DLayout, GemmSpec>(MRaws[i], NRaws[i], DsStride[i]);
+            },
+            Number<NumDTensor>{});
+    }
+
+    __device__ __host__ static constexpr auto GetMPerBlock() { return MPerBlock; }
+
+    template <bool HasMainKBlockLoop,
+              typename AsGridDesc_AK0_M_AK1,
+              typename BsGridDesc_BK0_N_BK1,
+              typename DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+              typename Block2ETileMap>
+    __device__ static void Run(AsGridPointer p_as_grid,
+                               BsGridPointer p_bs_grid,
+                               DsGridPointer p_ds_grid,
+                               EDataType* __restrict__ p_e_grid,
+                               void* __restrict__ p_shared,
+                               const AElementwiseOperation& a_element_op,
+                               const BElementwiseOperation& b_element_op,
+                               const CDEElementwiseOperation& cde_element_op,
+                               const AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1,
+                               const BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1,
+                               const DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                               const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
+                                   e_grid_desc_mblock_mperblock_nblock_nperblock,
+                               const Block2ETileMap& block_2_etile_map)
+    {
+        const auto as_grid_buf = generate_tuple(
+            [&](auto i) {
+                return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                    p_as_grid[i], as_grid_desc_ak0_m_ak1[i].GetElementSpaceSize());
+            },
+            Number<NumATensor>{});
+
+        const auto bs_grid_buf = generate_tuple(
+            [&](auto i) {
+                return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                    p_bs_grid[i], bs_grid_desc_bk0_n_bk1[i].GetElementSpaceSize());
+            },
+            Number<NumBTensor>{});
+
+        const auto ds_grid_buf = generate_tuple(
+            [&](auto i) {
+                return make_dynamic_buffer<AddressSpaceEnum::Global>(
+                    p_ds_grid[i],
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize());
+            },
+            Number<NumDTensor>{});
+
+        auto e_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+            p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+        // divide block work by [M, N]
+        const auto block_work_idx =
+            block_2_etile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
+
+        if(!block_2_etile_map.ValidCTileIndex(
+               block_work_idx,
+               make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
+                          e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
+        {
+            return;
+        }
+
+        // HACK: this force m/n_block_data_idx_on_grid into SGPR
+        const index_t m_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
+
+        const index_t n_block_data_idx_on_grid =
+            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);
+
+        // lds max alignment
+        constexpr auto max_lds_align = math::lcm(AK1, BK1);
+
+        // A matrix in LDS memory, dst of blockwise copy
+        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
+
+        // B matrix in LDS memory, dst of blockwise copy
+        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
+
+        const auto idx_as_block_begin =
+            generate_tuple([&](auto) { return make_multi_index(0, m_block_data_idx_on_grid, 0); },
+                           Number<NumATensor>{});
+
+        auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v7r2<
+            ThisThreadBlock,
+            AsDataType,
+            Tuple<ComputeDataType>,
+            decltype(as_grid_desc_ak0_m_ak1),
+            decltype(tie(a_block_desc_ak0_m_ak1)),
+            AElementwiseOperation,
+            Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
+            Sequence<AK0PerBlock, MPerBlock, AK1>,
+            ABlockTransferThreadClusterLengths_AK0_M_AK1,
+            ABlockTransferThreadClusterArrangeOrder,
+            ABlockTransferSrcAccessOrder,
+            Sequence<1, 0, 2>,
+            ABlockTransferSrcVectorDim,
+            2,
+            ABlockTransferSrcScalarPerVector,
+            ABlockTransferDstScalarPerVector_AK1,
+            uniform_sequence_gen_t<NumATensor, false>,
+            Sequence<true>>{as_grid_desc_ak0_m_ak1,
+                            idx_as_block_begin,
+                            tie(a_block_desc_ak0_m_ak1),
+                            make_tuple(make_multi_index(0, 0, 0)),
+                            a_element_op};
+
+        const auto idx_bs_block_begin =
+            generate_tuple([&](auto) { return make_multi_index(0, n_block_data_idx_on_grid, 0); },
+                           Number<NumBTensor>{});
+
+        auto b_blockwise_copy = ThreadGroupTensorSliceTransfer_v7r2<
+            ThisThreadBlock,
+            BsDataType,
+            Tuple<ComputeDataType>,
+            decltype(bs_grid_desc_bk0_n_bk1),
+            decltype(tie(b_block_desc_bk0_n_bk1)),
+            BElementwiseOperation,
+            Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
+            Sequence<BK0PerBlock, NPerBlock, BK1>,
+            BBlockTransferThreadClusterLengths_BK0_N_BK1,
+            BBlockTransferThreadClusterArrangeOrder,
+            BBlockTransferSrcAccessOrder,
+            Sequence<1, 0, 2>,
+            BBlockTransferSrcVectorDim,
+            2,
+            BBlockTransferSrcScalarPerVector,
+            BBlockTransferDstScalarPerVector_BK1,
+            uniform_sequence_gen_t<NumBTensor, false>,
+            Sequence<true>>{bs_grid_desc_bk0_n_bk1,
+                            idx_bs_block_begin,
+                            tie(b_block_desc_bk0_n_bk1),
+                            make_tuple(make_multi_index(0, 0, 0)),
+                            b_element_op};
+
+        // GEMM definition
+        //   c_mtx += transpose(a_mtx) * b_mtx
+        //     a_mtx[K0PerBlock, MPerBlock] is in LDS
+        //     b_mtx[K0PerBlock, NPerBlock] is in LDS
+        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
+        //       register
+        // sanity check
+        constexpr index_t KPack =
+            math::max(math::lcm(AK1, BK1),
+                      MfmaSelector<ComputeDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+
+        auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector<
+            BlockSize,
+            ComputeDataType,
+            AccDataType,
+            decltype(a_block_desc_ak0_m_ak1),
+            decltype(b_block_desc_bk0_n_bk1),
+            MPerXdl,
+            NPerXdl,
+            MXdlPerWave,
+            NXdlPerWave,
+            KPack,
+            LoopSched>();
+
+        auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();
+
+        // LDS allocation for A and B: be careful of alignment
+        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
+            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
+
+        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ComputeDataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
+
+        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+            static_cast<ComputeDataType*>(p_shared) + a_block_space_size_aligned,
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+
+        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
+
+        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
+            (as_grid_desc_ak0_m_ak1[I0].GetLength(I0) * as_grid_desc_ak0_m_ak1[I0].GetLength(I2)) /
+            KPerBlock);
+
+        // gridwise GEMM pipeline
+        const auto gridwise_gemm_pipeline =
+            GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>();
+
+        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(as_grid_desc_ak0_m_ak1,
+                                                               a_block_desc_ak0_m_ak1,
+                                                               a_blockwise_copy,
+                                                               as_grid_buf,
+                                                               a_block_buf,
+                                                               a_block_slice_copy_step,
+                                                               bs_grid_desc_bk0_n_bk1,
+                                                               b_block_desc_bk0_n_bk1,
+                                                               b_blockwise_copy,
+                                                               bs_grid_buf,
+                                                               b_block_buf,
+                                                               b_block_slice_copy_step,
+                                                               blockwise_gemm,
+                                                               c_thread_buf,
+                                                               num_k_block_main_loop);
+
+        // shuffle C and write out
+        {
+            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
+                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
+            constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
+
+            // TODO: hacky, fix it!
+            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
+                blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            // TODO: hacky, fix it!
+            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
+                blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+
+            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
+            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
+            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
+            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
+            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+
+            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
+                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
+
+            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                static_cast<CShuffleDataType*>(p_shared),
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
+            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
+                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
+                make_tuple(
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
+                        M1,                                      // M1 = MWave
+                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        M3,
+                        M4)),
+                    make_freeze_transform(I0),
+                    make_unmerge_transform(make_tuple(
+                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
+                        N1,                                      // N1 = NWave
+                        N2))),                                   // N2 = NPerXdl
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(
+                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+
+            // calculate origin of thread output tensor on global memory
+            //     blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block =
+                blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
+
+            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
+            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
+
+            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto m_thread_data_on_block_idx =
+                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
+                    make_multi_index(m_thread_data_on_block));
+
+            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
+                make_single_stage_tensor_adaptor(
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
+                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(Sequence<0>{}));
+
+            const auto n_thread_data_on_block_idx =
+                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
+                    make_multi_index(n_thread_data_on_block));
+
+            // shuffle: threadwise copy C from VGPR to LDS
+            auto c_thread_copy_vgpr_to_lds =
+                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
+                                                   CShuffleDataType,
+                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
+                                                            CShuffleNXdlPerWavePerShuffle,
+                                                            I1,
+                                                            I1,
+                                                            M2,
+                                                            I1,
+                                                            M4,
+                                                            I1>,
+                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                                   7,
+                                                   1,
+                                                   InMemoryDataOperationEnum::Set,
+                                                   1,
+                                                   true>{
+                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                    make_multi_index(0,
+                                     0,
+                                     m_thread_data_on_block_idx[I1],
+                                     n_thread_data_on_block_idx[I1],
+                                     m_thread_data_on_block_idx[I2],
+                                     m_thread_data_on_block_idx[I3],
+                                     m_thread_data_on_block_idx[I4],
+                                     n_thread_data_on_block_idx[I2]),
+                    ck::tensor_operation::element_wise::PassThrough{}};
+
+            // tuple of reference to C/Ds tensor descriptors
+            const auto c_ds_desc_refs = concat_tuple_of_reference(
+                tie(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
+                generate_tie(
+                    [&](auto i) -> const auto& // return type should be reference
+                    { return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; },
+                    Number<NumDTensor>{}));
+
+            // tuple of reference to C/Ds tensor descriptors
+            const auto c_ds_buf_refs = concat_tuple_of_reference(
+                tie(c_shuffle_block_buf),
+                generate_tie(
+                    [&](auto i) -> const auto& // return type should be reference
+                    { return ds_grid_buf[i]; },
+                    Number<NumDTensor>{}));
+
+            // tuple of starting index of C/Ds blockwise copy
+            const auto idx_c_ds_block_begin = container_concat(
+                make_tuple(make_multi_index(0, 0, 0, 0)),
+                generate_tuple(
+                    [&](auto) {
+                        return make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0);
+                    },
+                    Number<NumDTensor>{}));
+
+            // blockwise copy C/D/E between LDS and global
+            auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r2<
+                ThisThreadBlock,
+                decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})),
+                Tuple<EDataType>,
+                decltype(c_ds_desc_refs),
+                decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
+                CDEElementwiseOperation,
+                Sequence<static_cast<index_t>(EGlobalMemoryDataOperation)>, // FIXME: make Sequence
+                                                                            // support arbitray type
+                Sequence<1,
+                         CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                         1,
+                         CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
+                CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
+                Sequence<0, 1, 2, 3>, // typename SrcDimAccessOrder,
+                Sequence<0, 1, 2, 3>, // typename DstDimAccessOrder,
+                3,                    // index_t SrcVectorDim,
+                3,                    // index_t DstVectorDim,
+                CDEShuffleBlockTransferScalarPerVector_NPerBlock,
+                CDEShuffleBlockTransferScalarPerVector_NPerBlock,
+                sequence_merge_t<
+                    Sequence<true>,
+                    uniform_sequence_gen_t<NumDTensor,
+                                           false>>, // ThreadTransferSrcResetCoordinateAfterRunFlags
+                Sequence<false>>                    // ThreadTransferDstResetCoordinateAfterRunFlags
+                {c_ds_desc_refs,
+                 idx_c_ds_block_begin,
+                 tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                 make_tuple(make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0)),
+                 cde_element_op};
+
+            // space filling curve for threadwise C in VGPR before shuffle
+            constexpr auto sfc_c_vgpr =
+                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle,
+                                           CShuffleNXdlPerWavePerShuffle,
+                                           1,
+                                           1,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>>{};
+
+            // space filling curve for shuffled blockwise C/D/E
+            constexpr auto sfc_cde_block =
+                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
+                                  Sequence<0, 2, 1, 3>,
+                                  Sequence<1,
+                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
+                                           1,
+                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
+
+            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
+
+            static_assert(num_access == sfc_cde_block.GetNumOfAccess(), "wrong!");
+
+            static_for<0, num_access, 1>{}([&](auto access_id) {
+                // make sure it's safe to write to LDS
+                block_sync_lds();
+
+                // each thread write its data from VGPR to LDS
+                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
+                                              c_thread_buf,
+                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                                              c_shuffle_block_buf);
+
+                // make sure it's safe to read from LDS
+                block_sync_lds();
+
+                // each block copy its data from LDS to global
+                cde_block_copy_lds_and_global.Run(
+                    c_ds_desc_refs,
+                    c_ds_buf_refs,
+                    tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                    tie(e_grid_buf));
+
+                if constexpr(access_id < num_access - 1)
+                {
+                    constexpr auto cde_lds_and_global_step =
+                        sfc_cde_block.GetForwardStep(access_id);
+
+                    // move on Ds
+                    static_for<0, NumDTensor, 1>{}([&](auto i) {
+                        cde_block_copy_lds_and_global.MoveSrcSliceWindow(
+                            c_ds_desc_refs, i + I1, cde_lds_and_global_step);
+                    });
+
+                    // move on E
+                    cde_block_copy_lds_and_global.MoveDstSliceWindow(
+                        tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
+                        I0,
+                        cde_lds_and_global_step);
+                }
+            });
+        }
+    }
+
+    template <bool HasMainKBlockLoop,
+              GemmSpecialization GemmSpec,
+              typename AsLayout,
+              typename BsLayout,
+              typename DsLayout,
+              typename ELayout,
+              typename Block2ETileMap>
+    __device__ static void Run(AsGridPointer p_as_grid,
+                               BsGridPointer p_bs_grid,
+                               DsGridPointer p_ds_grid,
+                               void* __restrict__ p_e_grid_,
+                               void* __restrict__ p_shared,
+                               const AElementwiseOperation& a_element_op,
+                               const BElementwiseOperation& b_element_op,
+                               const CDEElementwiseOperation& cde_element_op,
+                               const index_t M,
+                               const index_t N,
+                               const index_t K,
+                               const std::array<index_t, NumATensor> StrideAs,
+                               const std::array<index_t, NumBTensor> StrideBs,
+                               const std::array<index_t, NumDTensor> StrideDs,
+                               const index_t StrideE,
+                               const Block2ETileMap& block_2_etile_map)
+    {
+        using AsGridDesc_M_K =
+            remove_cvref_t<decltype(MakeAsGridDescriptor_M_K<AsLayout, GemmSpec>({}, {}, {}))>;
+        using BsGridDesc_N_K =
+            remove_cvref_t<decltype(MakeBsGridDescriptor_N_K<BsLayout, GemmSpec>({}, {}, {}))>;
+        using DsGridDesc_M_N =
+            remove_cvref_t<decltype(MakeDsGridDescriptor_M_N<DsLayout, GemmSpec>({}, {}, {}))>;
+
+        const auto p_e_grid = reinterpret_cast<EDataType*>(p_e_grid_);
+
+        AsGridDesc_M_K as_grid_desc_m_k;
+        BsGridDesc_N_K bs_grid_desc_n_k;
+        DsGridDesc_M_N ds_grid_desc_m_n;
+
+        static_for<0, NumATensor, 1>{}([&](auto j) {
+            using ALayout = remove_cvref_t<tuple_element_t<j.value, AsLayout>>;
+
+            as_grid_desc_m_k(j) = MakeAGridDescriptor_M_K<ALayout, GemmSpec>(M, K, StrideAs[j]);
+        });
+
+        static_for<0, NumBTensor, 1>{}([&](auto j) {
+            using BLayout = remove_cvref_t<tuple_element_t<j.value, BsLayout>>;
+
+            bs_grid_desc_n_k(j) = MakeBGridDescriptor_N_K<BLayout, GemmSpec>(N, K, StrideBs[j]);
+        });
+
+        static_for<0, NumDTensor, 1>{}([&](auto j) {
+            using DLayout = remove_cvref_t<tuple_element_t<j.value, DsLayout>>;
+
+            ds_grid_desc_m_n(j) = MakeEGridDescriptor_M_N<DLayout, GemmSpec>(M, N, StrideDs[j]);
+        });
+
+        const auto e_grid_desc_m_n = MakeEGridDescriptor_M_N<ELayout, GemmSpec>(M, N, StrideE);
+
+        // tensor descriptors for block/thread-wise copy
+        const auto as_grid_desc_ak0_m_ak1 = MakeAsGridDescriptor_AK0_M_AK1(as_grid_desc_m_k);
+
+        const auto bs_grid_desc_bk0_n_bk1 = MakeBsGridDescriptor_BK0_N_BK1(bs_grid_desc_n_k);
+
+        const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n);
+
+        const auto e_grid_desc_mblock_mperblock_nblock_nperblock =
+            MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(e_grid_desc_m_n);
+
+        Run<HasMainKBlockLoop>(p_as_grid,
+                               p_bs_grid,
+                               p_ds_grid,
+                               p_e_grid,
+                               p_shared,
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op,
+                               as_grid_desc_ak0_m_ak1,
+                               bs_grid_desc_bk0_n_bk1,
+                               ds_grid_desc_mblock_mperblock_nblock_nperblock,
+                               e_grid_desc_mblock_mperblock_nblock_nperblock,
+                               block_2_etile_map);
+    }
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
index d3d7d5af85..25e1cebdbe 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp
@@ -9,13 +9,13 @@ namespace ck {
 
 struct GridwiseGemmPipeline_v2
 {
-    __host__ __device__ static constexpr bool IsSupported(index_t num_loop)
+    __host__ __device__ static constexpr bool IsSupported(const index_t num_loop)
     {
         // TODO: improve applicability
         return num_loop % 2 == 0;
     }
 
-    __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop)
+    __host__ __device__ static constexpr bool CalculateHasMainLoop(const index_t num_loop)
     {
         return (num_loop / 2) > 1;
     }
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
index b6c146ae61..e941f96553 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -175,7 +175,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
         return math::integer_divide_ceil(N, NPerBlock) * NPerBlock;
     }
 
-    __host__ static auto CalculateK0(index_t K) { return math::integer_divide_floor(K, K1Value); }
+    __host__ static auto CalculateK0(index_t K) { return math::integer_divide_ceil(K, K1Value); }
 
     // Argument
     struct Problem
@@ -369,9 +369,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                       "Invalid tuning param!");
 
         // check gridwise gemm pipeline
-        const index_t K0      = problem.K / K1Value;
-        const auto num_k_loop = K0 / K0PerBlock;
-
+        const auto num_k_loop = math::integer_divide_ceil(problem.K0, K0PerBlock);
         if(!GridwiseGemmPipe::IsSupported(num_k_loop))
         {
             return false;
@@ -1026,8 +1024,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
         }
 
         // check gridwise gemm pipeline
-        const index_t K0      = problem.K / K1;
-        const auto num_k_loop = K0 / K0PerBlock;
+        const auto num_k_loop = math::integer_divide_ceil(problem.K0, K0PerBlock);
 
         if(!GridwiseGemmPipe::IsSupported(num_k_loop))
         {
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
similarity index 52%
rename from include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp
rename to include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
index 93625a324e..e4e47b4fae 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp
@@ -16,6 +16,36 @@
 
 namespace ck {
 
+template <typename InputGridDesc,
+          typename InputDataType,
+          typename OutputGridDesc,
+          typename OutputDataType,
+          typename Block2ETileMap,
+          typename GridwiseTensorRearrangeKernel>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_tensor_rearrange(const InputGridDesc in_grid_desc,
+                                const InputDataType* __restrict__ p_in_global,
+                                const OutputGridDesc out_grid_desc,
+                                OutputDataType* __restrict__ p_out_global,
+                                const Block2ETileMap block_2_tile_map)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||             \
+    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
+    defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
+    GridwiseTensorRearrangeKernel::Run(
+        in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map);
+#else
+    ignore = in_grid_desc;
+    ignore = p_in_global;
+    ignore = out_grid_desc;
+    ignore = p_out_global;
+    ignore = block_2_tile_map;
+#endif
+}
+
 template <typename InputGridDesc,
           typename InputDataType,
           typename OutputGridDesc,
@@ -25,8 +55,9 @@ template <typename InputGridDesc,
           index_t KPerBlock,
           typename ThreadClusterLengths,
           index_t ScalarPerVector,
+          InMemoryDataOperationEnum DstInMemOp,
           typename Block2ETileMap>
-struct GridwiseImageToColumn
+struct GridwiseTensorRearrange
 {
 
     static constexpr auto I0 = Number<0>{};
@@ -55,27 +86,27 @@ struct GridwiseImageToColumn
         auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_global, out_grid_desc.GetElementSpaceSize());
 
-        auto copy_global_to_global = ThreadGroupTensorSliceTransfer_v7<
-            ThisThreadBlock,
-            Tuple<InputDataType>,
-            Tuple<OutputDataType>,
-            decltype(tie(in_grid_desc)),
-            decltype(tie(out_grid_desc)),
-            tensor_operation::element_wise::PassThrough,
-            Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
-            Sequence<MPerBlock, KPerBlock>,
-            ThreadClusterLengths,
-            Sequence<0, 1>,
-            Sequence<0, 1>,
-            I1,
-            ScalarPerVector,
-            Sequence<true>,
-            Sequence<true>>{
-            in_grid_desc,
-            make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
-            out_grid_desc,
-            make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
-            tensor_operation::element_wise::PassThrough{}};
+        auto copy_global_to_global =
+            ThreadGroupTensorSliceTransfer_v7<ThisThreadBlock,
+                                              Tuple<InputDataType>,
+                                              Tuple<OutputDataType>,
+                                              decltype(tie(in_grid_desc)),
+                                              decltype(tie(out_grid_desc)),
+                                              tensor_operation::element_wise::PassThrough,
+                                              Sequence<static_cast<index_t>(DstInMemOp)>,
+                                              Sequence<MPerBlock, KPerBlock>,
+                                              ThreadClusterLengths,
+                                              Sequence<0, 1>,
+                                              Sequence<0, 1>,
+                                              I1,
+                                              ScalarPerVector,
+                                              Sequence<true>,
+                                              Sequence<true>>{
+                in_grid_desc,
+                make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
+                out_grid_desc,
+                make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
+                tensor_operation::element_wise::PassThrough{}};
 
         copy_global_to_global.Run(
             tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp
new file mode 100644
index 0000000000..003a2b96d9
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r2.hpp
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
+#include "ck/utility/is_detected.hpp"
+
+namespace ck {
+
+// Thread-level multi-source, multi-destination tensor slice data movement
+// Assume:
+//   1. All sources and destinations are DynamicBuffer
+//   2. Same VectorDim and ScalerPerVector for all sources and destinations
+//   3. DstInMemOps are per destination tensor
+//   4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor
+//   5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor
+//   6. Does not need to know src_descs and dst_descs at compile-time
+//   7. Does not need to know src_slice_origins and dst_slice_origins at compile-time,
+//
+// Does following things to avoid scratch memory issue
+//   1. Use StaticallyIndexedArray or vector_type instead of C array for thread buffer
+//   2. Pass tensor descritpors by reference (or tuple of references)
+//   3. Does not keep reference to tensor descriptor
+//   4. Does not construct new tensor coordinate when call Run()
+template <typename SrcDatas,
+          typename DstDatas,
+          typename SrcDescs,
+          typename DstDescs,
+          typename ElementwiseOperation,
+          typename DstInMemOps, // Sequence<InMemoryDataOperationEnum ...>
+          typename SliceLengths,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          index_t SrcScalarPerVector,
+          index_t DstScalarPerVector,
+          typename SrcResetCoordinateAfterRunFlags, // Sequence<bool ...>
+          typename DstResetCoordinateAfterRunFlags> // Sequence<bool ...>
+struct ThreadwiseTensorSliceTransfer_v7r2
+{
+    static constexpr auto I0 = Number<0>{};
+
+    static constexpr index_t nDim = SliceLengths::Size();
+
+    static constexpr index_t nSrc = SrcDescs::Size();
+    static constexpr index_t nDst = DstDescs::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    // return a tuple of coordiantes for a tuple of tensor
+    template <typename Descs,
+              typename Indices,
+              enable_if_t<Descs::Size() == Indices::Size(), bool> = false>
+    static constexpr auto MakeCoordinates(const Descs& descs, const Indices& indices)
+    {
+        return generate_tuple([&](auto i) { return make_tensor_coordinate(descs[i], indices[i]); },
+                              Number<Descs::Size()>{});
+    }
+
+    using SrcCoords = decltype(MakeCoordinates(SrcDescs{}, StaticallyIndexedArray<Index, nSrc>{}));
+    using DstCoords = decltype(MakeCoordinates(DstDescs{}, StaticallyIndexedArray<Index, nDst>{}));
+
+    // scalar per access on each dim
+    // FIXME: don't use lambda_scalar_per_access
+    static constexpr auto src_scalar_per_access = generate_sequence(
+        detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
+
+    using SrcSpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                   SrcDimAccessOrder,
+                                                   remove_cv_t<decltype(src_scalar_per_access)>>;
+
+    static constexpr auto dst_scalar_per_access = generate_sequence(
+        detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});
+
+    using DstSpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                   DstDimAccessOrder,
+                                                   remove_cv_t<decltype(dst_scalar_per_access)>>;
+
+    __device__ constexpr ThreadwiseTensorSliceTransfer_v7r2(
+        const SrcDescs& src_descs,
+        const StaticallyIndexedArray<Index, nSrc>& src_slice_origins,
+        const DstDescs& dst_descs,
+        const StaticallyIndexedArray<Index, nDst>& dst_slice_origins,
+        const ElementwiseOperation& element_op)
+        : src_coords_(MakeCoordinates(src_descs, src_slice_origins)),
+          dst_coords_(MakeCoordinates(dst_descs, dst_slice_origins)),
+          element_op_(element_op)
+    {
+        static_assert(SliceLengths::At(Number<SrcVectorDim>{}) % SrcScalarPerVector == 0,
+                      "wrong! cannot evenly divide");
+
+        static_assert(SliceLengths::At(Number<DstVectorDim>{}) % DstScalarPerVector == 0,
+                      "wrong! cannot evenly divide");
+    }
+
+    template <typename Indices, enable_if_t<SrcDescs::Size() == Indices::Size(), bool> = false>
+    __device__ void SetSrcSliceOrigins(const SrcDescs& src_descs,
+                                       const Indices& src_slice_origin_idxs)
+    {
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            src_coords_(i) = make_tensor_coordinate(src_descs[i], src_slice_origin_idxs[i]);
+        });
+    }
+
+    template <typename Indices, enable_if_t<DstDescs::Size() == Indices::Size(), bool> = false>
+    __device__ void SetDstSliceOrigins(const DstDescs& dst_descs,
+                                       const Indices& dst_slice_origin_idxs)
+    {
+        static_for<0, nDst, 1>{}([&](auto i) {
+            dst_coords_(i) = make_tensor_coordinate(dst_descs[i], dst_slice_origin_idxs[i]);
+        });
+    }
+
+    template <typename DataTypes, index_t ScalarPerVector>
+    __device__ static auto generate_vectors()
+    {
+        auto data_types = DataTypes{};
+
+        constexpr index_t num = data_types.Size();
+
+        return generate_tuple(
+            [&](auto i) {
+                using DataType = remove_cvref_t<decltype(data_types[i])>;
+
+                return vector_type_maker_t<DataType, ScalarPerVector>{};
+            },
+            Number<num>{});
+    }
+
+    template <typename T>
+    using has_vec_len = decltype(std::declval<T&>().vec_len);
+
+    // SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
+    // SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
+    template <typename SrcBuffers,
+              enable_if_t<SrcDescs::Size() == SrcBuffers::Size(), bool> = false>
+    __device__ void RunRead(const SrcDescs& src_descs, const SrcBuffers& src_bufs)
+    {
+        // loop over space-filling curve
+        static_for<0, num_access, 1>{}([&](auto iAccess) {
+            auto src_vectors = generate_vectors<SrcDatas, SrcScalarPerVector>();
+            auto dst_vectors = generate_vectors<DstDatas, DstScalarPerVector>();
+
+            // copy data from src_bufs into src_vectors
+            static_for<0, nSrc, 1>{}([&](auto i) {
+                using src_vector_t = typename remove_cvref_t<decltype(src_vectors[i])>::type;
+
+                const bool is_src_valid =
+                    coordinate_has_valid_offset_assuming_visible_index_is_valid(src_descs[i],
+                                                                                src_coords_[i]);
+
+                src_vectors(i).template AsType<src_vector_t>()(I0) =
+                    src_bufs[i].template Get<src_vector_t>(src_coords_[i].GetOffset(),
+                                                           is_src_valid);
+            });
+
+            if constexpr(is_detected<has_vec_len, decltype(element_op_)>::value)
+            {
+                constexpr auto elem_op_vec_len = decltype(element_op_)::vec_len;
+
+                static_assert(is_same<remove_cvref_t<decltype(elem_op_vec_len)>, index_t>::value,
+                              "vec_len in element_op_ type is not index_t");
+
+                static_assert(elem_op_vec_len == 1 || elem_op_vec_len == 2 ||
+                                  elem_op_vec_len == 4 || elem_op_vec_len == 8,
+                              "vec_len in element_op_ must be 1, 2, 4, 8");
+
+                static_assert(SrcScalarPerVector % elem_op_vec_len == 0,
+                              "vec_len in element_op_ cannot be divided by SrcScalarPerVector!");
+
+                // apply pointwise function
+                static_for<0, SrcScalarPerVector / elem_op_vec_len, 1>{}([&](auto i) {
+                    // get reference to src data
+                    const auto src_data_refs = generate_tie(
+                        // return type should be lvalue
+                        [&](auto iSrc) -> const auto& {
+                            using SrcData = remove_cvref_t<tuple_element_t<iSrc.value, SrcDatas>>;
+
+                            using elem_op_vec_t =
+                                typename vector_type<SrcData, elem_op_vec_len>::type;
+
+                            return src_vectors[iSrc].template AsType<elem_op_vec_t>()[i];
+                        },
+                        Number<nSrc>{});
+
+                    // get reference to dst data
+                    auto dst_data_refs = generate_tie(
+                        // return type should be lvalue
+                        [&](auto iDst) -> auto& {
+                            using DstData = remove_cvref_t<tuple_element_t<iDst.value, DstDatas>>;
+
+                            using elem_op_vec_t =
+                                typename vector_type<DstData, elem_op_vec_len>::type;
+
+                            return dst_vectors(iDst).template AsType<elem_op_vec_t>()(i);
+                        },
+                        Number<nDst>{});
+
+                    // apply pointwise function
+                    // pointwise function signature:
+                    // element_op_(dst_data_refs[I0],
+                    //             dst_data_refs[I1],
+                    //             ...,
+                    //             src_data_refs[I0],
+                    //             src_data_refs[I1],
+                    //             ...)
+                    unpack2(element_op_, dst_data_refs, src_data_refs);
+                });
+            }
+            else
+            {
+                // apply pointwise function
+                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
+                    // get reference to src data
+                    const auto src_data_refs = generate_tie(
+                        // return type should be lvalue
+                        [&](auto iSrc) -> const auto& {
+                            using SrcData = remove_cvref_t<tuple_element_t<iSrc.value, SrcDatas>>;
+
+                            return src_vectors[iSrc].template AsType<SrcData>()[i];
+                        },
+                        Number<nSrc>{});
+
+                    // get reference to dst data
+                    auto dst_data_refs = generate_tie(
+                        // return type should be lvalue
+                        [&](auto iDst) -> auto& {
+                            using DstData = remove_cvref_t<tuple_element_t<iDst.value, DstDatas>>;
+
+                            return dst_vectors(iDst).template AsType<DstData>()(i);
+                        },
+                        Number<nDst>{});
+
+                    // apply pointwise function
+                    // pointwise function signature:
+                    // element_op_(dst_data_refs[I0],
+                    //             dst_data_refs[I1],
+                    //             ...,
+                    //             src_data_refs[I0],
+                    //             src_data_refs[I1],
+                    //             ...)
+                    unpack2(element_op_, dst_data_refs, src_data_refs);
+                });
+            }
+
+            dst_vectors_tuple_(iAccess) = dst_vectors;
+
+            // move coordinate
+            if constexpr(iAccess.value != num_access - 1)
+            {
+                constexpr auto forward_step = SrcSpaceFillingCurve::GetForwardStep(iAccess);
+
+                static_for<0, nSrc, 1>{}([&](auto i) {
+                    move_tensor_coordinate(src_descs[i],
+                                           src_coords_(i),
+                                           make_tensor_coordinate_step(src_descs[i], forward_step));
+                });
+            }
+        });
+
+        // move coordinate back to slice origin (or not)
+        static_for<0, nSrc, 1>{}([&](auto i) {
+            if constexpr(SrcResetCoordinateAfterRunFlags::At(i))
+            {
+                const auto src_reset_step =
+                    make_tensor_coordinate_step(src_descs[i], GetSrcCoordinateResetStep());
+
+                move_tensor_coordinate(src_descs[i], src_coords_(i), src_reset_step);
+            }
+        });
+    }
+
+    // DstDescs: Tuple<const DstDesc0&, const DstDesc1&, ...>
+    // DstBuffers: Tuple<const DstBuffer0&, const DstBuffer1&, ...>
+    template <typename DstBuffers,
+              enable_if_t<DstDescs::Size() == DstBuffers::Size(), bool> = false>
+    __device__ void RunWrite(const DstDescs& dst_descs, DstBuffers dst_bufs)
+    {
+        // loop over space-filling curve
+        static_for<0, num_access, 1>{}([&](auto iAccess) {
+            auto dst_vectors = dst_vectors_tuple_[iAccess];
+
+            // copy data from buf_vectors into dst_bufs
+            static_for<0, nDst, 1>{}([&](auto i) {
+                using dst_vector_t = typename remove_cvref_t<decltype(dst_vectors[i])>::type;
+
+                const bool is_dst_valid =
+                    coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_descs[i],
+                                                                                dst_coords_[i]);
+
+                constexpr InMemoryDataOperationEnum DstInMemOp =
+                    static_cast<InMemoryDataOperationEnum>(DstInMemOps::At(i.value));
+
+                dst_bufs(i).template Update<DstInMemOp, dst_vector_t>(
+                    dst_coords_[i].GetOffset(),
+                    is_dst_valid,
+                    dst_vectors[i].template AsType<dst_vector_t>()[I0]);
+            });
+
+            // move coordinate
+            if constexpr(iAccess.value != num_access - 1)
+            {
+                constexpr auto forward_step = DstSpaceFillingCurve::GetForwardStep(iAccess);
+
+                static_for<0, nDst, 1>{}([&](auto i) {
+                    move_tensor_coordinate(dst_descs[i],
+                                           dst_coords_(i),
+                                           make_tensor_coordinate_step(dst_descs[i], forward_step));
+                });
+            }
+        });
+
+        static_for<0, nDst, 1>{}([&](auto i) {
+            if constexpr(DstResetCoordinateAfterRunFlags::At(i))
+            {
+                const auto dst_reset_step =
+                    make_tensor_coordinate_step(dst_descs[i], GetDstCoordinateResetStep());
+
+                move_tensor_coordinate(dst_descs[i], dst_coords_(i), dst_reset_step);
+            }
+        });
+    }
+
+    // SrcDescs: Tuple<const SrcDesc0&, const SrcDesc1&, ...>
+    // SrcBuffers: Tuple<const SrcBuffer0&, const SrcBuffer1&, ...>
+    // DstDescs: Tuple<const DstDesc0&, const DstDesc1&, ...>
+    // DstBuffers: Tuple<const DstBuffer0&, const DstBuffer1&, ...>
+    template <typename SrcBuffers,
+              typename DstBuffers,
+              enable_if_t<SrcDescs::Size() == SrcBuffers::Size() &&
+                              DstDescs::Size() == DstBuffers::Size(),
+                          bool> = false>
+    __device__ void Run(const SrcDescs& src_descs,
+                        const SrcBuffers& src_bufs,
+                        const DstDescs& dst_descs,
+                        DstBuffers dst_bufs)
+    {
+        RunRead(src_descs, src_bufs);
+        RunWrite(dst_descs, dst_bufs);
+    }
+
+    __device__ static constexpr auto GetSrcCoordinateResetStep()
+    {
+        if constexpr(num_access == 0)
+        {
+            return typename SrcSpaceFillingCurve::Index{};
+        }
+        else
+        {
+            return SrcSpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
+        }
+    }
+
+    __device__ static constexpr auto GetDstCoordinateResetStep()
+    {
+        if constexpr(num_access == 0)
+        {
+            return typename DstSpaceFillingCurve::Index{};
+        }
+        else
+        {
+            return DstSpaceFillingCurve::GetStepBetween(Number<num_access - 1>{}, Number<0>{});
+        }
+    }
+
+    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
+    template <index_t ISrc>
+    __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs,
+                                       Number<ISrc> iSrc,
+                                       const Index& src_slice_origin_step_idx)
+    {
+        // if src coord was not reset by RunRead(), then need to adjust the step here
+        const auto adjusted_step_idx =
+            SrcResetCoordinateAfterRunFlags::At(iSrc)
+                ? src_slice_origin_step_idx
+                : src_slice_origin_step_idx + GetSrcCoordinateResetStep();
+
+        // is it OK to construct a new step every time?
+        const auto adjusted_step = make_tensor_coordinate_step(src_descs[iSrc], adjusted_step_idx);
+
+        move_tensor_coordinate(src_descs[iSrc], src_coords_(iSrc), adjusted_step);
+    }
+
+    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
+    template <index_t IDst>
+    __device__ void MoveDstSliceWindow(const DstDescs& dst_descs,
+                                       Number<IDst> iDst,
+                                       const Index& dst_slice_origin_step_idx)
+    {
+        // if dst coord was not reset by Run(), then need to adjust the step here
+        const auto adjusted_step_idx =
+            DstResetCoordinateAfterRunFlags::At(iDst)
+                ? dst_slice_origin_step_idx
+                : dst_slice_origin_step_idx + GetDstCoordinateResetStep();
+
+        // is it OK to construct a new step every time?
+        const auto adjusted_step = make_tensor_coordinate_step(dst_descs[iDst], adjusted_step_idx);
+
+        move_tensor_coordinate(dst_descs[iDst], dst_coords_(iDst), adjusted_step);
+    }
+
+    private:
+    using SrcVectorsType = decltype(generate_vectors<SrcDatas, SrcScalarPerVector>());
+    using DstVectorsType = decltype(generate_vectors<DstDatas, DstScalarPerVector>());
+
+    static constexpr auto num_access = SrcSpaceFillingCurve::GetNumOfAccess();
+
+    StaticallyIndexedArray<DstVectorsType, num_access> dst_vectors_tuple_;
+
+    SrcCoords src_coords_;
+    DstCoords dst_coords_;
+    const ElementwiseOperation element_op_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
index cee3d2825b..6f546f1d6d 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
@@ -20,348 +20,13 @@ struct TransformConvFwdToGemm
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
 
-    template <typename ALayout,
-              typename std::enable_if<NDimSpatial == 1 &&
-                                          is_same_v<ALayout, tensor_layout::convolution::GNWC>,
-                                      bool>::type = false>
-    static auto
-    MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* a_g_n_c_wis_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
-                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
-                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
-                        const std::array<index_t, NDimSpatial>& input_left_pads,
-                        const std::array<index_t, NDimSpatial>& input_right_pads)
-    {
-        const index_t N = a_g_n_c_wis_lengths[1];
-        const index_t C = a_g_n_c_wis_lengths[2];
-
-        const index_t Wi = a_g_n_c_wis_lengths[3];
-
-        const index_t Wo = c_g_n_k_wos_lengths[3];
-
-        const index_t ConvStrideW = conv_filter_strides[0];
-
-        if constexpr(ConvForwardSpecialization ==
-                     device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
-        {
-            const index_t NWo =
-                N * ck::accumulate_n<index_t>(
-                        c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
-
-            const auto in_gemmm_gemmk_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(NWo, C));
-
-            return in_gemmm_gemmk_desc;
-        }
-        else if constexpr(ConvForwardSpecialization ==
-                          device::ConvolutionForwardSpecialization::Filter1x1Pad0)
-        {
-            const auto in_n_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C));
-
-            const auto in_n_wo_c_desc = transform_tensor_descriptor(
-                in_n_wi_c_desc,
-                make_tuple(make_pass_through_transform(N),
-                           make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
-                           make_pass_through_transform(C)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
-            const auto in_gemmm_gemmk_desc = transform_tensor_descriptor(
-                in_n_wo_c_desc,
-                make_tuple(make_merge_transform(make_tuple(N, Wo)), make_pass_through_transform(C)),
-                make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            return in_gemmm_gemmk_desc;
-        }
-        else
-        {
-            const index_t X             = b_g_k_c_xs_lengths[3];
-            const index_t ConvDilationW = conv_filter_dilations[0];
-            const index_t InLeftPadW    = input_left_pads[0];
-            const index_t InRightPadW   = input_right_pads[0];
-
-            const auto in_n_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C));
-
-            const auto in_n_wip_c_desc = transform_tensor_descriptor(
-                in_n_wi_c_desc,
-                make_tuple(make_pass_through_transform(N),
-                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                           make_pass_through_transform(C)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
-
-            const auto in_n_x_wo_c_desc = transform_tensor_descriptor(
-                in_n_wip_c_desc,
-                make_tuple(
-                    make_pass_through_transform(N),
-                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
-                    make_pass_through_transform(C)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));
-
-            const auto in_gemmm_gemmk_desc =
-                transform_tensor_descriptor(in_n_x_wo_c_desc,
-                                            make_tuple(make_merge_transform(make_tuple(N, Wo)),
-                                                       make_merge_transform(make_tuple(X, C))),
-                                            make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            return in_gemmm_gemmk_desc;
-        }
-    }
-
-    template <typename ALayout,
-              typename std::enable_if<NDimSpatial == 2 &&
-                                          is_same_v<ALayout, tensor_layout::convolution::GNHWC>,
-                                      bool>::type = false>
-    static auto
-    MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* a_g_n_c_wis_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
-                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
-                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
-                        const std::array<index_t, NDimSpatial>& input_left_pads,
-                        const std::array<index_t, NDimSpatial>& input_right_pads)
-    {
-        const index_t N = a_g_n_c_wis_lengths[1];
-        const index_t C = a_g_n_c_wis_lengths[2];
-
-        const index_t Hi = a_g_n_c_wis_lengths[3];
-        const index_t Wi = a_g_n_c_wis_lengths[4];
-
-        const index_t Ho = c_g_n_k_wos_lengths[3];
-        const index_t Wo = c_g_n_k_wos_lengths[4];
-
-        const index_t ConvStrideH = conv_filter_strides[0];
-        const index_t ConvStrideW = conv_filter_strides[1];
-
-        if constexpr(ConvForwardSpecialization ==
-                     device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
-        {
-            const index_t NHoWo =
-                N * ck::accumulate_n<index_t>(
-                        c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
-
-            const auto in_gemmm_gemmk_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(NHoWo, C));
-
-            return in_gemmm_gemmk_desc;
-        }
-        else if constexpr(ConvForwardSpecialization ==
-                          device::ConvolutionForwardSpecialization::Filter1x1Pad0)
-        {
-            const auto in_n_hi_wi_c_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C));
-
-            const auto in_n_ho_wo_c_desc = transform_tensor_descriptor(
-                in_n_hi_wi_c_desc,
-                make_tuple(make_pass_through_transform(N),
-                           make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
-                           make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
-                           make_pass_through_transform(C)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-            const auto in_gemmm_gemmk_desc =
-                transform_tensor_descriptor(in_n_ho_wo_c_desc,
-                                            make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
-                                                       make_pass_through_transform(C)),
-                                            make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            return in_gemmm_gemmk_desc;
-        }
-        else
-        {
-            const index_t Y = b_g_k_c_xs_lengths[3];
-            const index_t X = b_g_k_c_xs_lengths[4];
-
-            const index_t ConvDilationH = conv_filter_dilations[0];
-            const index_t ConvDilationW = conv_filter_dilations[1];
-
-            const index_t InLeftPadH = input_left_pads[0];
-            const index_t InLeftPadW = input_left_pads[1];
-
-            const index_t InRightPadH = input_right_pads[0];
-            const index_t InRightPadW = input_right_pads[1];
-
-            const auto in_n_hi_wi_c_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C));
-
-            const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
-                in_n_hi_wi_c_desc,
-                make_tuple(make_pass_through_transform(N),
-                           make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                           make_pass_through_transform(C)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-            const auto in_n_y_ho_x_wo_c_desc = transform_tensor_descriptor(
-                in_n_hip_wip_c_desc,
-                make_tuple(
-                    make_pass_through_transform(N),
-                    make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
-                    make_pass_through_transform(C)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
-
-            const auto in_gemmm_gemmk_desc =
-                transform_tensor_descriptor(in_n_y_ho_x_wo_c_desc,
-                                            make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)),
-                                                       make_merge_transform(make_tuple(Y, X, C))),
-                                            make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            return in_gemmm_gemmk_desc;
-        }
-    }
-
-    template <typename ALayout,
-              typename std::enable_if<NDimSpatial == 3 &&
-                                          is_same_v<ALayout, tensor_layout::convolution::GNDHWC>,
-                                      bool>::type = false>
-    static auto
-    MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* a_g_n_c_wis_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* b_g_k_c_xs_strides */,
-                        const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */,
-                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
-                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
-                        const std::array<index_t, NDimSpatial>& input_left_pads,
-                        const std::array<index_t, NDimSpatial>& input_right_pads)
-    {
-        const index_t N = a_g_n_c_wis_lengths[1];
-        const index_t C = a_g_n_c_wis_lengths[2];
-
-        const index_t Di = a_g_n_c_wis_lengths[3];
-        const index_t Hi = a_g_n_c_wis_lengths[4];
-        const index_t Wi = a_g_n_c_wis_lengths[5];
-
-        const index_t Do = c_g_n_k_wos_lengths[3];
-        const index_t Ho = c_g_n_k_wos_lengths[4];
-        const index_t Wo = c_g_n_k_wos_lengths[5];
-
-        const index_t ConvStrideD = conv_filter_strides[0];
-        const index_t ConvStrideH = conv_filter_strides[1];
-        const index_t ConvStrideW = conv_filter_strides[2];
-
-        if constexpr(ConvForwardSpecialization ==
-                     device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
-        {
-            const index_t NDoHoWo =
-                N * ck::accumulate_n<index_t>(
-                        c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());
-
-            const auto in_gemmm_gemmk_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(NDoHoWo, C));
-
-            return in_gemmm_gemmk_desc;
-        }
-        else if constexpr(ConvForwardSpecialization ==
-                          device::ConvolutionForwardSpecialization::Filter1x1Pad0)
-        {
-            const auto in_n_di_hi_wi_c_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
-
-            const auto in_n_do_ho_wo_c_desc = transform_tensor_descriptor(
-                in_n_di_hi_wi_c_desc,
-                make_tuple(make_pass_through_transform(N),
-                           make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)),
-                           make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)),
-                           make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)),
-                           make_pass_through_transform(C)),
-                make_tuple(
-                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
-                make_tuple(
-                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
-
-            const auto in_gemmm_gemmk_desc = transform_tensor_descriptor(
-                in_n_do_ho_wo_c_desc,
-                make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
-                           make_pass_through_transform(C)),
-                make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            return in_gemmm_gemmk_desc;
-        }
-        else
-        {
-            const index_t Z = b_g_k_c_xs_lengths[3];
-            const index_t Y = b_g_k_c_xs_lengths[4];
-            const index_t X = b_g_k_c_xs_lengths[5];
-
-            const index_t ConvDilationD = conv_filter_dilations[0];
-            const index_t ConvDilationH = conv_filter_dilations[1];
-            const index_t ConvDilationW = conv_filter_dilations[2];
-
-            const index_t InLeftPadD = input_left_pads[0];
-            const index_t InLeftPadH = input_left_pads[1];
-            const index_t InLeftPadW = input_left_pads[2];
-
-            const index_t InRightPadD = input_right_pads[0];
-            const index_t InRightPadH = input_right_pads[1];
-            const index_t InRightPadW = input_right_pads[2];
-
-            const auto in_n_di_hi_wi_c_desc =
-                make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C));
-
-            const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
-                in_n_di_hi_wi_c_desc,
-                make_tuple(make_pass_through_transform(N),
-                           make_pad_transform(Di, InLeftPadD, InRightPadD),
-                           make_pad_transform(Hi, InLeftPadH, InRightPadH),
-                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
-                           make_pass_through_transform(C)),
-                make_tuple(
-                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
-                make_tuple(
-                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
-
-            const auto in_n_z_do_y_ho_x_wo_c_desc = transform_tensor_descriptor(
-                in_n_hip_wip_c_desc,
-                make_tuple(
-                    make_pass_through_transform(N),
-                    make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)),
-                    make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
-                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
-                    make_pass_through_transform(C)),
-                make_tuple(
-                    Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
-                make_tuple(Sequence<0>{},
-                           Sequence<1, 2>{},
-                           Sequence<3, 4>{},
-                           Sequence<5, 6>{},
-                           Sequence<7>{}));
-
-            const auto in_gemmm_gemmk_desc = transform_tensor_descriptor(
-                in_n_z_do_y_ho_x_wo_c_desc,
-                make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
-                           make_merge_transform(make_tuple(Z, Y, X, C))),
-                make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            return in_gemmm_gemmk_desc;
-        }
-    }
-
     // TODO: implement ck::tensor_layout::convolution that describe packed/strided dimemsion as
     // properties
     template <typename ALayout,
               typename std::enable_if<NDimSpatial == 1 &&
                                           (is_same_v<ALayout, tensor_layout::convolution::G_NW_C> ||
-                                           is_same_v<ALayout, tensor_layout::convolution::NWGC>),
+                                           is_same_v<ALayout, tensor_layout::convolution::NWGC> ||
+                                           is_same_v<ALayout, tensor_layout::convolution::GNWC>),
                                       bool>::type = false>
     static auto
     MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
@@ -473,7 +138,8 @@ struct TransformConvFwdToGemm
     template <typename ALayout,
               typename std::enable_if<
                   NDimSpatial == 2 && (is_same_v<ALayout, tensor_layout::convolution::G_NHW_C> ||
-                                       is_same_v<ALayout, tensor_layout::convolution::NHWGC>),
+                                       is_same_v<ALayout, tensor_layout::convolution::NHWGC> ||
+                                       is_same_v<ALayout, tensor_layout::convolution::GNHWC>),
                   bool>::type = false>
     static auto
     MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
@@ -601,7 +267,8 @@ struct TransformConvFwdToGemm
     template <typename ALayout,
               typename std::enable_if<
                   NDimSpatial == 3 && (is_same_v<ALayout, tensor_layout::convolution::G_NDHW_C> ||
-                                       is_same_v<ALayout, tensor_layout::convolution::NDHWGC>),
+                                       is_same_v<ALayout, tensor_layout::convolution::NDHWGC> ||
+                                       is_same_v<ALayout, tensor_layout::convolution::GNDHWC>),
                   bool>::type = false>
     static auto
     MakeADescriptor_M_K(const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp
index 02d61f34ed..85756cd0d1 100644
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -140,10 +140,36 @@ struct DynamicBuffer
         }
         else if constexpr(Op == InMemoryDataOperationEnum::Add)
         {
-            auto tmp = this->template Get<X>(i, is_valid_element);
-            this->template Set<X>(i, is_valid_element, x + tmp);
-            // tmp += x;
-            // this->template Set<X>(i, is_valid_element, tmp);
+            auto tmp       = this->template Get<X>(i, is_valid_element);
+            using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
+            // handle bfloat addition
+            if constexpr(is_same_v<scalar_t, bhalf_t>)
+            {
+                if constexpr(is_scalar_type<X>::value)
+                {
+                    // Scalar type
+                    auto result =
+                        type_convert<X>(type_convert<float>(x) + type_convert<float>(tmp));
+                    this->template Set<X>(i, is_valid_element, result);
+                }
+                else
+                {
+                    // Vector type
+                    constexpr auto vector_size = scalar_type<remove_cvref_t<X>>::vector_size;
+                    const vector_type<scalar_t, vector_size> a_vector{tmp};
+                    const vector_type<scalar_t, vector_size> b_vector{x};
+                    static_for<0, vector_size, 1>{}([&](auto idx) {
+                        auto result = type_convert<scalar_t>(
+                            type_convert<float>(a_vector.template AsType<scalar_t>()[idx]) +
+                            type_convert<float>(b_vector.template AsType<scalar_t>()[idx]));
+                        this->template Set<scalar_t>(i + idx, is_valid_element, result);
+                    });
+                }
+            }
+            else
+            {
+                this->template Set<X>(i, is_valid_element, x + tmp);
+            }
         }
     }
 
diff --git a/include/ck/utility/is_detected.hpp b/include/ck/utility/is_detected.hpp
new file mode 100644
index 0000000000..31a4bf24d6
--- /dev/null
+++ b/include/ck/utility/is_detected.hpp
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+
+namespace detail {
+template <class Default, class AlwaysVoid, template <class...> class Op, class... Args>
+struct detector
+{
+    using value_t = std::false_type;
+    using type    = Default;
+};
+
+template <class Default, template <class...> class Op, class... Args>
+struct detector<Default, std::void_t<Op<Args...>>, Op, Args...>
+{
+    using value_t = std::true_type;
+    using type    = Op<Args...>;
+};
+} // namespace detail
+
+struct nonesuch
+{
+    ~nonesuch()               = delete;
+    nonesuch(nonesuch const&) = delete;
+    void operator=(nonesuch const&) = delete;
+};
+
+template <template <class...> class Op, class... Args>
+using is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t;
+
+} // namespace ck
diff --git a/include/ck/utility/tuple.hpp b/include/ck/utility/tuple.hpp
index b616b3123f..3ff0af89db 100644
--- a/include/ck/utility/tuple.hpp
+++ b/include/ck/utility/tuple.hpp
@@ -177,6 +177,8 @@ struct Tuple : detail::TupleImpl<typename arithmetic_sequence_gen<0, sizeof...(X
     }
 
     __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
+
+    __host__ __device__ static constexpr bool IsTuple() { return true; }
 };
 
 template <>
diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp
index 70619ee0a5..935926070d 100644
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -9,8 +9,10 @@
 
 namespace ck {
 
-// Convert X to Y
-template <typename Y, typename X>
+// Convert X to Y, both X and Y are non-const data types.
+template <typename Y,
+          typename X,
+          std::enable_if_t<!(std::is_const_v<Y> || std::is_const_v<X>), bool> = false>
 __host__ __device__ constexpr Y type_convert(X x)
 {
     static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
@@ -18,6 +20,19 @@ __host__ __device__ constexpr Y type_convert(X x)
     return static_cast<Y>(x);
 }
 
+// Convert X to Y, either X or Y is a const data type.
+template <typename Y,
+          typename X,
+          std::enable_if_t<std::is_const_v<Y> || std::is_const_v<X>, bool> = false>
+__host__ __device__ constexpr Y type_convert(X x)
+{
+    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
+
+    using NonConstY = std::remove_const_t<Y>;
+    using NonConstX = std::remove_const_t<X>;
+    return static_cast<Y>(type_convert<NonConstY, NonConstX>(x));
+}
+
 // convert bfp16 to fp32
 template <>
 inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t x)
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp
new file mode 100644
index 0000000000..7ccfc6eb77
--- /dev/null
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <type_traits>
+#include <sstream>
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+/**
+ * \brief Reference implementation for column to image.
+ *
+ * Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
+ * Memory layout is the same.
+ * Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
+ * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
+ *
+ * \tparam NDimSpatial Number of spatial dimensions.
+ * \tparam ImageLayout Image Layout.
+ * \tparam InDataType Input Data Type.
+ * \tparam OutDataType Output Data Type.
+ */
+template <ck::index_t NDimSpatial,
+          typename ImageLayout,
+          typename InDataType,
+          typename OutDataType,
+          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
+struct ReferenceColumnToImage : public device::BaseOperator
+{
+    // Argument
+    struct Argument : public device::BaseArgument
+    {
+        public:
+        Argument(const Tensor<InDataType>& input,
+                 Tensor<OutDataType>& output,
+                 std::vector<ck::index_t> filter_spatial_lengths,
+                 std::vector<ck::index_t> conv_filter_strides,
+                 std::vector<ck::index_t> conv_filter_dilations,
+                 std::vector<ck::index_t> input_left_pads,
+                 std::vector<ck::index_t> input_right_pads)
+            : input_{input},
+              output_{output},
+              conv_strides_{conv_filter_strides},
+              conv_dilations_{conv_filter_dilations},
+              in_left_pads_{input_left_pads},
+              in_right_pads_{input_right_pads},
+              filter_spatial_lengths_{filter_spatial_lengths}
+        {
+            initOutputSpatialLengths();
+        }
+
+        const Tensor<InDataType>& input_;
+        Tensor<OutDataType>& output_;
+
+        std::vector<index_t> conv_strides_;
+        std::vector<index_t> conv_dilations_;
+        std::vector<index_t> in_left_pads_;
+        std::vector<index_t> in_right_pads_;
+
+        std::vector<index_t> filter_spatial_lengths_;
+        std::vector<index_t> output_spatial_lengths_;
+
+        private:
+        void initOutputSpatialLengths()
+        {
+            constexpr auto input_offset_to_spatial = 3;
+
+            for(ck::index_t i = 0; i < NDimSpatial; ++i)
+            {
+                // XEff = (X - 1) * conv_dilation_w + 1;
+                // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+                const ck::index_t x_eff = (filter_spatial_lengths_[i] - 1) * conv_dilations_[i] + 1;
+
+                output_spatial_lengths_.push_back(
+                    (output_.GetLengths()[i + input_offset_to_spatial] + in_left_pads_[i] +
+                     in_right_pads_[i] - x_eff) /
+                        conv_strides_[i] +
+                    1);
+            }
+        }
+    };
+
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceColumnToImage::Argument;
+
+        float Run(const Argument& arg)
+        {
+            if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 &&
+                 arg.input_.GetNumOfDimension() == 2))
+            {
+                throw std::runtime_error("wrong! inconsistent dimension");
+            }
+
+            const index_t N = arg.output_.GetLengths()[1];
+            const index_t C = arg.output_.GetLengths()[2];
+
+            if constexpr(NDimSpatial == 1)
+            {
+                const index_t Wo = arg.output_spatial_lengths_[0];
+                auto func        = [&](auto n) {
+                    for(index_t wo = 0; wo < Wo; ++wo)
+                    {
+                        index_t row    = n * Wo + wo;
+                        index_t column = 0;
+
+                        for(index_t x = 0; x < arg.filter_spatial_lengths_[0]; ++x)
+                        {
+                            auto wi = static_cast<ck::long_index_t>(wo * arg.conv_strides_[0]) +
+                                      static_cast<ck::long_index_t>(x * arg.conv_dilations_[0]) -
+                                      static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
+
+                            for(index_t c = 0; c < C; ++c)
+                            {
+                                if(wi >= 0 &&
+                                   ck::type_convert<std::size_t>(wi) < arg.output_.GetLengths()[3])
+                                {
+                                    float v_in  = ck::type_convert<float>(arg.input_(row, column));
+                                    float v_out = ck::type_convert<float>(arg.output_(0, n, c, wi));
+                                    arg.output_(0, n, c, wi) =
+                                        ck::type_convert<OutDataType>(v_in + v_out);
+                                }
+                                column++;
+                            }
+                        }
+                    }
+                };
+
+                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+
+                return 0;
+            }
+            else if constexpr(NDimSpatial == 2)
+            {
+                const index_t Ho = arg.output_spatial_lengths_[0];
+                const index_t Wo = arg.output_spatial_lengths_[1];
+
+                auto func = [&](auto n) {
+                    for(index_t ho = 0; ho < Ho; ++ho)
+                    {
+                        for(index_t wo = 0; wo < Wo; ++wo)
+                        {
+                            index_t row    = n * Ho * Wo + ho * Wo + wo;
+                            index_t column = 0;
+
+                            for(index_t y = 0; y < arg.filter_spatial_lengths_[0]; ++y)
+                            {
+                                auto hi =
+                                    static_cast<ck::long_index_t>(ho * arg.conv_strides_[0]) +
+                                    static_cast<ck::long_index_t>(y * arg.conv_dilations_[0]) -
+                                    static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
+
+                                for(index_t x = 0; x < arg.filter_spatial_lengths_[1]; ++x)
+                                {
+                                    auto wi =
+                                        static_cast<ck::long_index_t>(wo * arg.conv_strides_[1]) +
+                                        static_cast<ck::long_index_t>(x * arg.conv_dilations_[1]) -
+                                        static_cast<ck::long_index_t>(arg.in_left_pads_[1]);
+
+                                    for(index_t c = 0; c < C; ++c)
+                                    {
+
+                                        if(hi >= 0 &&
+                                           ck::type_convert<std::size_t>(hi) <
+                                               arg.output_.GetLengths()[3] &&
+                                           wi >= 0 &&
+                                           ck::type_convert<std::size_t>(wi) <
+                                               arg.output_.GetLengths()[4])
+                                        {
+                                            float v_in =
+                                                ck::type_convert<float>(arg.input_(row, column));
+                                            float v_out = ck::type_convert<float>(
+                                                arg.output_(0, n, c, hi, wi));
+                                            arg.output_(0, n, c, hi, wi) =
+                                                ck::type_convert<OutDataType>(v_in + v_out);
+                                        }
+                                        column++;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+
+                return 0;
+            }
+            else if constexpr(NDimSpatial == 3)
+            {
+                const index_t Do = arg.output_spatial_lengths_[0];
+                const index_t Ho = arg.output_spatial_lengths_[1];
+                const index_t Wo = arg.output_spatial_lengths_[2];
+
+                auto func = [&](auto n) {
+                    for(index_t d_o = 0; d_o < Do; ++d_o)
+                    {
+                        for(index_t ho = 0; ho < Ho; ++ho)
+                        {
+                            for(index_t wo = 0; wo < Wo; ++wo)
+                            {
+                                index_t row    = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
+                                index_t column = 0;
+
+                                for(index_t z = 0; z < arg.filter_spatial_lengths_[0]; ++z)
+                                {
+                                    auto di =
+                                        static_cast<ck::long_index_t>(d_o * arg.conv_strides_[0]) +
+                                        static_cast<ck::long_index_t>(z * arg.conv_dilations_[0]) -
+                                        static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
+                                    for(index_t y = 0; y < arg.filter_spatial_lengths_[1]; ++y)
+                                    {
+                                        auto hi =
+                                            static_cast<ck::long_index_t>(ho *
+                                                                          arg.conv_strides_[1]) +
+                                            static_cast<ck::long_index_t>(y *
+                                                                          arg.conv_dilations_[1]) -
+                                            static_cast<ck::long_index_t>(arg.in_left_pads_[1]);
+                                        for(index_t x = 0; x < arg.filter_spatial_lengths_[2]; ++x)
+                                        {
+                                            auto wi =
+                                                static_cast<ck::long_index_t>(
+                                                    wo * arg.conv_strides_[2]) +
+                                                static_cast<ck::long_index_t>(
+                                                    x * arg.conv_dilations_[2]) -
+                                                static_cast<ck::long_index_t>(arg.in_left_pads_[2]);
+                                            for(index_t c = 0; c < C; ++c)
+                                            {
+                                                if(di >= 0 &&
+                                                   ck::type_convert<std::size_t>(di) <
+                                                       arg.output_.GetLengths()[3] &&
+                                                   hi >= 0 &&
+                                                   ck::type_convert<std::size_t>(hi) <
+                                                       arg.output_.GetLengths()[4] &&
+                                                   wi >= 0 &&
+                                                   ck::type_convert<std::size_t>(wi) <
+                                                       arg.output_.GetLengths()[5])
+                                                {
+                                                    float v_in = ck::type_convert<float>(
+                                                        arg.input_(row, column));
+                                                    float v_out = ck::type_convert<float>(
+                                                        arg.output_(0, n, c, di, hi, wi));
+                                                    arg.output_(0, n, c, di, hi, wi) =
+                                                        ck::type_convert<OutDataType>(v_in + v_out);
+                                                }
+                                                column++;
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency());
+
+                return 0;
+            }
+        }
+
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /*stream_config*/ = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        using namespace tensor_layout::convolution;
+
+        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
+                       std::is_same_v<ImageLayout, GNDHWC>))
+        {
+            return false;
+        }
+        if constexpr(!(NDimSpatial >= 1 && NDimSpatial <= 3))
+        {
+            return false;
+        }
+        return true;
+    }
+
+    bool IsSupportedArgument(const Argument& arg)
+    {
+        const ck::index_t G = arg.output_.GetLengths()[0];
+        const ck::index_t N = arg.output_.GetLengths()[1];
+        const ck::index_t C = arg.output_.GetLengths()[2];
+
+        const index_t NDoHoWo =
+            N * ck::accumulate_n<index_t>(
+                    arg.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+        const index_t CZYX =
+            C * ck::accumulate_n<index_t>(
+                    arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+
+        if(!(arg.input_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
+             arg.input_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
+        {
+            return false;
+        }
+
+        if(G != 1)
+        {
+            return false;
+        }
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const Tensor<InDataType>& input,
+                             Tensor<OutDataType>& output,
+                             std::vector<ck::index_t> filter_spatial_lengths,
+                             std::vector<ck::index_t> conv_filter_strides,
+                             std::vector<ck::index_t> conv_filter_dilations,
+                             std::vector<ck::index_t> input_left_pads,
+                             std::vector<ck::index_t> input_right_pads)
+    {
+        return Argument{input,
+                        output,
+                        filter_spatial_lengths,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceColumnToImage"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
index 3f50ab88b3..9e12d07844 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
@@ -18,16 +18,18 @@ namespace host {
 /**
  * \brief Reference implementation for image to column.
  *
- * Tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
+ * Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
  * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
+ * Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout.
+ * Memory layout is the same.
  *
  * \tparam NDimSpatial Number of spatial dimensions.
- * \tparam InputLayout Input Layout.
+ * \tparam ImageLayout Image Layout.
  * \tparam InDataType Input Data Type.
  * \tparam OutDataType Output Data Type.
  */
 template <ck::index_t NDimSpatial,
-          typename InputLayout,
+          typename ImageLayout,
           typename InDataType,
           typename OutDataType,
           typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
@@ -240,8 +242,8 @@ struct ReferenceImageToColumn : public device::BaseOperator
     {
         using namespace tensor_layout::convolution;
 
-        if constexpr(!(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
-                       std::is_same_v<InputLayout, GNDHWC>))
+        if constexpr(!(std::is_same_v<ImageLayout, GNWC> || std::is_same_v<ImageLayout, GNHWC> ||
+                       std::is_same_v<ImageLayout, GNDHWC>))
         {
             return false;
         }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
new file mode 100644
index 0000000000..57c0f88567
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <vector>
+#include <memory>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::conv_tensor_rearrange_op;
+
+// Image to Column
+// nhwc, 1d
+void add_device_image_to_column_nwc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nwc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nwc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nwc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+// nhwc, 2d
+void add_device_image_to_column_nhwc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nhwc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nhwc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_nhwc_2d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+// nhwc, 3d
+void add_device_image_to_column_ndhwc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_ndhwc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_ndhwc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
+        instances);
+
+void add_device_image_to_column_ndhwc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
+        instances);
+
+// Column to Image
+// nhwc, 1d
+void add_device_column_to_image_nwc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nwc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nwc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nwc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+// nhwc, 2d
+void add_device_column_to_image_nhwc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nhwc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nhwc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_nhwc_2d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+// nhwc, 3d
+void add_device_column_to_image_ndhwc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_ndhwc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_ndhwc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
+        instances);
+
+void add_device_column_to_image_ndhwc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
+        instances);
+
+template <ck::index_t NumDimSpatial,
+          typename ImageLayout,
+          typename InDataType,
+          typename OutDataType,
+          typename ConvTensorRearrangeOp>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceConvTensorRearrange<NumDimSpatial,
+                                                            ImageLayout,
+                                                            InDataType,
+                                                            OutDataType,
+                                                            ConvTensorRearrangeOp>>
+{
+    using DeviceOp = DeviceConvTensorRearrange<NumDimSpatial,
+                                               ImageLayout,
+                                               InDataType,
+                                               OutDataType,
+                                               ConvTensorRearrangeOp>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(is_same_v<ConvTensorRearrangeOp, ImageToColumn>)
+        {
+            if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, GNWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_nwc_1d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_nwc_1d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_nwc_1d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_nwc_1d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_nhwc_2d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_nhwc_2d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_nhwc_2d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_nhwc_2d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_image_to_column_ndhwc_3d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_image_to_column_ndhwc_3d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_image_to_column_ndhwc_3d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_image_to_column_ndhwc_3d_i8_instances(op_ptrs);
+                }
+            }
+        }
+        else if constexpr(is_same_v<ConvTensorRearrangeOp, ColumnToImage>)
+        {
+            if constexpr(NumDimSpatial == 1 && is_same_v<ImageLayout, GNWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_nwc_1d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_nwc_1d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_nwc_1d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_nwc_1d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 2 && is_same_v<ImageLayout, GNHWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_nhwc_2d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_nhwc_2d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_nhwc_2d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_nhwc_2d_i8_instances(op_ptrs);
+                }
+            }
+            else if constexpr(NumDimSpatial == 3 && is_same_v<ImageLayout, GNDHWC>)
+            {
+                if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
+                {
+                    add_device_column_to_image_ndhwc_3d_f32_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
+                {
+                    add_device_column_to_image_ndhwc_3d_f16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                                  is_same_v<OutDataType, ck::bhalf_t>)
+                {
+                    add_device_column_to_image_ndhwc_3d_bf16_instances(op_ptrs);
+                }
+                else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
+                {
+                    add_device_column_to_image_ndhwc_3d_i8_instances(op_ptrs);
+                }
+            }
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
new file mode 100644
index 0000000000..681f466677
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::tensor_layout::convolution;
+using namespace ck::conv_tensor_rearrange_op;
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+template <ck::index_t NDimSpatial, typename InLayout>
+using device_column_to_image_bf16_instances = std::tuple<
+    // clang-format off
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        // generic instance
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    16,    16,   S<8, 8>,     1>,
+
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    64,    64,   S<8, 8>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    64,   128,  S<8, 16>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     8>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, typename InLayout>
+using device_column_to_image_f16_instances = std::tuple<
+    // clang-format off
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        // generic instance
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,    64,    16,    16,   S<8, 8>,     1>,
+        
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,    64,    64,    64,   S<8, 8>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   128,    64,   128,  S<8, 16>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     8>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, typename InLayout>
+using device_column_to_image_f32_instances = std::tuple<
+    // clang-format off
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        // generic instance
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,    64,    16,    16,   S<8, 8>,     1>,
+        
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     4>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, typename InLayout>
+using device_column_to_image_i8_instances = std::tuple<
+    // clang-format off
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        // generic instance
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    16,    16,   S<8, 8>,     1>,
+
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    64,    64,   S<8, 8>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    64,   128,  S<8, 16>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    64,    64, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     4>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     8>,
+        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   256,   256, S<16, 16>,     16>
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp
similarity index 70%
rename from library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp
rename to library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp
index a2603218b2..74a2155a04 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp
@@ -13,6 +13,7 @@ namespace device {
 namespace instance {
 
 using namespace ck::tensor_layout::convolution;
+using namespace ck::conv_tensor_rearrange_op;
 
 using BF16 = ck::bhalf_t;
 using F16  = ck::half_t;
@@ -28,17 +29,12 @@ using device_image_to_column_bf16_instances = std::tuple<
         //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
         //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
         //#####################|           |         |           |            |      |      |      |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,     8,     8,   S<8, 8>,     1>,
+        // generic instance
         DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    16,    16,   S<8, 8>,     1>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    32,    32,   S<8, 8>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    64,    64,   S<8, 8>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    16,    16,  S<8, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    64,    64,  S<8, 16>,     1>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    32,    64,  S<8, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    64,   128,  S<8, 16>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    16,    16, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     1>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     8>
@@ -52,17 +48,13 @@ using device_image_to_column_f16_instances = std::tuple<
         //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
         //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
         //#####################|           |         |           |            |      |      |      |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,     8,     8,   S<8, 8>,     1>,
+        // generic instance        
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,    16,    16,   S<8, 8>,     1>,
+
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,    32,    32,   S<8, 8>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,    64,    64,   S<8, 8>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    16,    16,  S<8, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    64,    64,  S<8, 16>,     1>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    32,    64,  S<8, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    64,   128,  S<8, 16>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    16,    16, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     1>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     8>
@@ -76,15 +68,11 @@ using device_image_to_column_f32_instances = std::tuple<
         //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
         //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
         //#####################|           |         |           |            |      |      |      |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,    64,     8,     8,   S<8, 8>,     1>,
+        // generic instance      
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,    64,    16,    16,   S<8, 8>,     1>,
+
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,    64,    32,    32,   S<8, 8>,     4>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    16,    16,  S<8, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    64,    64,  S<8, 16>,     1>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    32,    64,  S<8, 16>,     4>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    16,    16, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     1>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     4>
     // clang-format on
@@ -97,17 +85,13 @@ using device_image_to_column_i8_instances = std::tuple<
         //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
         //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
         //#####################|           |         |           |            |      |      |      |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,     8,     8,   S<8, 8>,     1>,
+        // generic instance
         DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    16,    16,   S<8, 8>,     1>,
+
         DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    32,    32,   S<8, 8>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    64,    64,   S<8, 8>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    16,    16,  S<8, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    64,    64,  S<8, 16>,     1>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    32,    64,  S<8, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    64,   128,  S<8, 16>,     8>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    16,    16, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    64,    64, S<16, 16>,     1>,
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     1>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    64,    64, S<16, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     4>,
         DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     8>,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
index 3ceea58c1a..7128a10fa1 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -312,6 +312,23 @@ void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(
         DeviceGemm<Row, Col, Row, F64, F64, F64, PassThrough, PassThrough, PassThrough>>>&
         instances);
 #endif
+#ifdef CK_ENABLE_FP8
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances);
+#endif
 template <typename ALayout,
           typename BLayout,
           typename CLayout,
@@ -505,6 +522,32 @@ struct DeviceOperationInstanceFactory<
 #endif
             }
         }
+#endif
+#ifdef CK_ENABLE_FP8
+        else if constexpr(is_same_v<ADataType, ck::f8_t> && is_same_v<BDataType, ck::f8_t> &&
+                          is_same_v<CDataType, ck::f8_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_kn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances(op_ptrs);
+            }
+        }
 #endif
         return op_ptrs;
     }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp b/library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp
deleted file mode 100644
index 6c4526ba4e..0000000000
--- a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp
+++ /dev/null
@@ -1,135 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include <vector>
-#include <memory>
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-// nhwc, 1d
-void add_device_image_to_column_nhwc_1d_bf16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, BF16, BF16>>>& instances);
-
-void add_device_image_to_column_nhwc_1d_f16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, F16, F16>>>& instances);
-
-void add_device_image_to_column_nhwc_1d_f32_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, F32, F32>>>& instances);
-
-void add_device_image_to_column_nhwc_1d_i8_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, int8_t, int8_t>>>& instances);
-// nhwc, 2d
-void add_device_image_to_column_nhwc_2d_bf16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, BF16, BF16>>>& instances);
-
-void add_device_image_to_column_nhwc_2d_f16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, F16, F16>>>& instances);
-
-void add_device_image_to_column_nhwc_2d_f32_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, F32, F32>>>& instances);
-
-void add_device_image_to_column_nhwc_2d_i8_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, int8_t, int8_t>>>& instances);
-// nhwc, 3d
-void add_device_image_to_column_nhwc_3d_bf16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, BF16, BF16>>>& instances);
-
-void add_device_image_to_column_nhwc_3d_f16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, F16, F16>>>& instances);
-
-void add_device_image_to_column_nhwc_3d_f32_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, F32, F32>>>& instances);
-
-void add_device_image_to_column_nhwc_3d_i8_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, int8_t, int8_t>>>& instances);
-
-template <ck::index_t NumDimSpatial, typename InLayout, typename InDataType, typename OutDataType>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::
-        DeviceImageToColumn<NumDimSpatial, InLayout, InDataType, OutDataType>>
-{
-    using DeviceOp = DeviceImageToColumn<NumDimSpatial, InLayout, InDataType, OutDataType>;
-
-    static auto GetInstances()
-    {
-        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-
-        if constexpr(NumDimSpatial == 1 && is_same_v<InLayout, GNWC>)
-        {
-            if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
-            {
-                add_device_image_to_column_nhwc_1d_f32_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
-            {
-                add_device_image_to_column_nhwc_1d_f16_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                              is_same_v<OutDataType, ck::bhalf_t>)
-            {
-                add_device_image_to_column_nhwc_1d_bf16_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
-            {
-                add_device_image_to_column_nhwc_1d_i8_instances(op_ptrs);
-            }
-        }
-        else if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC>)
-        {
-            if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
-            {
-                add_device_image_to_column_nhwc_2d_f32_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
-            {
-                add_device_image_to_column_nhwc_2d_f16_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                              is_same_v<OutDataType, ck::bhalf_t>)
-            {
-                add_device_image_to_column_nhwc_2d_bf16_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
-            {
-                add_device_image_to_column_nhwc_2d_i8_instances(op_ptrs);
-            }
-        }
-        else if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, GNDHWC>)
-        {
-            if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
-            {
-                add_device_image_to_column_nhwc_3d_f32_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
-            {
-                add_device_image_to_column_nhwc_3d_f16_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
-                              is_same_v<OutDataType, ck::bhalf_t>)
-            {
-                add_device_image_to_column_nhwc_3d_bf16_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<InDataType, int8_t> && is_same_v<OutDataType, int8_t>)
-            {
-                add_device_image_to_column_nhwc_3d_i8_instances(op_ptrs);
-            }
-        }
-
-        return op_ptrs;
-    }
-};
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt
new file mode 100644
index 0000000000..5d1a554524
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_instance_library(device_column_to_image_instance
+   device_column_to_image_nhwc_1d_instance.cpp
+   device_column_to_image_nhwc_2d_instance.cpp
+   device_column_to_image_nhwc_3d_instance.cpp
+)
diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp
new file mode 100644
index 0000000000..8ba4d29775
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::conv_tensor_rearrange_op;
+
+void add_device_column_to_image_nwc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_BF16
+    add_device_operation_instances(instances, device_column_to_image_bf16_instances<1, GNWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+void add_device_column_to_image_nwc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_FP16
+    add_device_operation_instances(instances, device_column_to_image_f16_instances<1, GNWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+void add_device_column_to_image_nwc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_FP32
+    add_device_operation_instances(instances, device_column_to_image_f32_instances<1, GNWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+void add_device_column_to_image_nwc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_INT8
+    add_device_operation_instances(instances, device_column_to_image_i8_instances<1, GNWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp
new file mode 100644
index 0000000000..4de665a63f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::conv_tensor_rearrange_op;
+
+void add_device_column_to_image_nhwc_2d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_BF16
+    add_device_operation_instances(instances, device_column_to_image_bf16_instances<2, GNHWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+void add_device_column_to_image_nhwc_2d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_FP16
+    add_device_operation_instances(instances, device_column_to_image_f16_instances<2, GNHWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+void add_device_column_to_image_nhwc_2d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_FP32
+    add_device_operation_instances(instances, device_column_to_image_f32_instances<2, GNHWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+void add_device_column_to_image_nhwc_2d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_INT8
+    add_device_operation_instances(instances, device_column_to_image_i8_instances<2, GNHWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp
new file mode 100644
index 0000000000..9762b46c43
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::conv_tensor_rearrange_op;
+
+void add_device_column_to_image_ndhwc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_BF16
+    add_device_operation_instances(instances, device_column_to_image_bf16_instances<3, GNDHWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+void add_device_column_to_image_ndhwc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_FP16
+    add_device_operation_instances(instances, device_column_to_image_f16_instances<3, GNDHWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+void add_device_column_to_image_ndhwc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_FP32
+    add_device_operation_instances(instances, device_column_to_image_f32_instances<3, GNDHWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+void add_device_column_to_image_ndhwc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ColumnToImage>>>&
+        instances)
+{
+#ifdef CK_ENABLE_INT8
+    add_device_operation_instances(instances, device_column_to_image_i8_instances<3, GNDHWC>{});
+#else
+    ignore = instances;
+#endif
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
index 22cf1a462d..89820775c3 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -89,6 +89,11 @@ list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_ins
                            device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp
                            device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp)
 
+list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp
+                           device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
+                           device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
+                           device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp)
+
 add_instance_library(device_gemm_instance ${GEMM_INSTANCES})
 
 set(ENABLE_PIPELINE_V2_OPT OFF)
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
new file mode 100644
index 0000000000..9739046d3c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#ifdef CK_ENABLE_FP8
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F32 = float;
+using F8  = f8_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[k, n] = c[m, n]
+using device_gemm_xdl_c_shuffle_f8_f8_f8_km_kn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //#####################| ALayout| BLayout| CLayout|  AData|  BData|  CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################|        |        |        |   Type|   Type|   Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################|        |        |        |       |       |       |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################|        |        |        |       |       |       |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    64,   4,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    64,   4,   4,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    64,   4,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,   4,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    64,   4,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    64,   4,   4,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    64,   4,   4,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    64,   4,   4,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>
+        // clang-format on
+        >;
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_xdl_c_shuffle_f8_f8_f8_km_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp
new file mode 100644
index 0000000000..810c1b87cb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#ifdef CK_ENABLE_FP8
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F32 = float;
+using F8  = f8_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+using device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances =
+    std::tuple<
+        // clang-format off
+        //#####################| ALayout| BLayout| CLayout|  AData|  BData|  CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################|        |        |        |   Type|   Type|   Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################|        |        |        |       |       |       |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################|        |        |        |       |       |       |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    64,   4,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    64,   4,  16,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    64,   4,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,   4,  16,   32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    64,   4,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    64,   4,  16,   32,   32,    2,    2,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    64,   4,  16,   32,   32,    2,    1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    64,   4,  16,   32,   32,    1,    2,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Col,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>
+        // clang-format on
+        >;
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Col, Col, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_xdl_c_shuffle_f8_f8_f8_km_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp
new file mode 100644
index 0000000000..e1aac45134
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_kn_mn_instance.cpp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#ifdef CK_ENABLE_FP8
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F32 = float;
+using F8  = f8_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[m, k] * b[k, n] = c[m, n]
+using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //#####################| ALayout| BLayout| CLayout|  AData|  BData|  CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################|        |        |        |   Type|   Type|   Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################|        |        |        |       |       |       |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################|        |        |        |       |       |       |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    64,  16,   4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    64,  16,   4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    64,  16,   4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,   4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    64,  16,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    64,  16,   4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,             16,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    64,  16,   4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    64,  16,   4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,         0,           1,           1,               S<1, 64, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Row,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 64, 1, 4>,             16>
+        // clang-format on
+        >;
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Row, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_xdl_c_shuffle_f8_f8_f8_mk_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
new file mode 100644
index 0000000000..5c557312aa
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_fp8_fp8_fp8_mk_nk_mn_instance.cpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#ifdef CK_ENABLE_FP8
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F32 = float;
+using F8  = f8_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[m, k] * b[n, k] = c[m, n]
+using device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances =
+    std::tuple<
+        // clang-format off
+        //#####################| ALayout| BLayout| CLayout|  AData|  BData|  CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //#####################|        |        |        |   Type|   Type|   Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //#####################|        |        |        |       |       |       |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //#####################|        |        |        |       |       |       |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    64,  16,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    64,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    64,    64,  16,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,    64,    64,  16,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,   128,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,             16>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,     F8,     F8,     F8,     F32,       F8, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,             16>
+        // clang-format on
+        >;
+
+void add_device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemm<Row, Col, Row, F8, F8, F8, PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_xdl_c_shuffle_f8_f8_f8_mk_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif
diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp
index c8463623c3..3a629f2346 100644
--- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 
 namespace ck {
@@ -9,28 +9,50 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_image_to_column_nhwc_1d_bf16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, BF16, BF16>>>& instances)
+using namespace ck::conv_tensor_rearrange_op;
+
+void add_device_image_to_column_nwc_1d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, BF16, BF16, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_BF16
     add_device_operation_instances(instances, device_image_to_column_bf16_instances<1, GNWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
-void add_device_image_to_column_nhwc_1d_f16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, F16, F16>>>& instances)
+void add_device_image_to_column_nwc_1d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F16, F16, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_FP16
     add_device_operation_instances(instances, device_image_to_column_f16_instances<1, GNWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
-void add_device_image_to_column_nhwc_1d_f32_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, F32, F32>>>& instances)
+void add_device_image_to_column_nwc_1d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, F32, F32, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_FP32
     add_device_operation_instances(instances, device_image_to_column_f32_instances<1, GNWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
-void add_device_image_to_column_nhwc_1d_i8_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<1, GNWC, int8_t, int8_t>>>& instances)
+void add_device_image_to_column_nwc_1d_i8_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<1, GNWC, int8_t, int8_t, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_INT8
     add_device_operation_instances(instances, device_image_to_column_i8_instances<1, GNWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp
index 652c7fac2a..7115e75667 100644
--- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 
 namespace ck {
@@ -9,28 +9,51 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+using namespace ck::conv_tensor_rearrange_op;
+
 void add_device_image_to_column_nhwc_2d_bf16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, BF16, BF16>>>& instances)
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, BF16, BF16, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_BF16
     add_device_operation_instances(instances, device_image_to_column_bf16_instances<2, GNHWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
 void add_device_image_to_column_nhwc_2d_f16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, F16, F16>>>& instances)
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F16, F16, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_FP16
     add_device_operation_instances(instances, device_image_to_column_f16_instances<2, GNHWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
 void add_device_image_to_column_nhwc_2d_f32_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, F32, F32>>>& instances)
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, F32, F32, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_FP32
     add_device_operation_instances(instances, device_image_to_column_f32_instances<2, GNHWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
 void add_device_image_to_column_nhwc_2d_i8_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<2, GNHWC, int8_t, int8_t>>>& instances)
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<2, GNHWC, int8_t, int8_t, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_INT8
     add_device_operation_instances(instances, device_image_to_column_i8_instances<2, GNHWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp
index 07774504d7..8290dae928 100644
--- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
-#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 
 namespace ck {
@@ -9,28 +9,51 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_image_to_column_nhwc_3d_bf16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, BF16, BF16>>>& instances)
+using namespace ck::conv_tensor_rearrange_op;
+
+void add_device_image_to_column_ndhwc_3d_bf16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, BF16, BF16, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_BF16
     add_device_operation_instances(instances, device_image_to_column_bf16_instances<3, GNDHWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
-void add_device_image_to_column_nhwc_3d_f16_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, F16, F16>>>& instances)
+void add_device_image_to_column_ndhwc_3d_f16_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F16, F16, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_FP16
     add_device_operation_instances(instances, device_image_to_column_f16_instances<3, GNDHWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
-void add_device_image_to_column_nhwc_3d_f32_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, F32, F32>>>& instances)
+void add_device_image_to_column_ndhwc_3d_f32_instances(
+    std::vector<std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, F32, F32, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_FP32
     add_device_operation_instances(instances, device_image_to_column_f32_instances<3, GNDHWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
-void add_device_image_to_column_nhwc_3d_i8_instances(
-    std::vector<std::unique_ptr<DeviceImageToColumn<3, GNDHWC, int8_t, int8_t>>>& instances)
+void add_device_image_to_column_ndhwc_3d_i8_instances(
+    std::vector<
+        std::unique_ptr<DeviceConvTensorRearrange<3, GNDHWC, int8_t, int8_t, ImageToColumn>>>&
+        instances)
 {
+#ifdef CK_ENABLE_INT8
     add_device_operation_instances(instances, device_image_to_column_i8_instances<3, GNDHWC>{});
+#else
+    ignore = instances;
+#endif
 }
 
 } // namespace instance
diff --git a/profiler/README.md b/profiler/README.md
index d03bfa7fc4..d081fc33e9 100644
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -185,7 +185,7 @@ GB/s: 69.2301
 ```
 Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
 
-## Profile image to column kernels
+## Profile image to column/column to image kernels
 ```bash
 # arg1: tensor operation (" OP_NAME ": " OP_DESC ")
 # arg2: data type (0: Input fp32, Weight fp32, Output fp32
@@ -197,6 +197,7 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
 # arg5: initialization (0: no init, 1: integer value, 2: decimal value)
 # arg6: print tensor value (0: no; 1: yes)
 # arg7: time kernel (0: no, 1: yes)
+# arg8: operation type (0: ImageToColumn, 1: ColumnToImage)
 # Following arguments (depending on number of spatial dims):
 #  Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
 #  G, N, K, C, 
@@ -207,8 +208,8 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
 #  <left padding>, (ie LeftPy, LeftPx for 2D)
 #  <right padding>, (ie RightPy, RightPx for 2D)
 
- ################             op   datatype  layout  verify  init  log  time  Ndims  G   N   K   C  Y  X  Hi  Wi  Sy  Sx  Dy  Dx  LeftPy  LeftPx  RightPy  RightPx
-./bin/ckProfiler image_to_column          0       0       1     1    0     1      2  1 256   1 512  3  3   28  28   1   1   1   1        0       0       0        0
+ ################                   op   datatype  layout  verify  init  log  time opType Ndims  G   N   K   C  Y  X  Hi  Wi  Sy  Sx  Dy  Dx  LeftPy  LeftPx  RightPy  RightPx
+./bin/ckProfiler conv_tensor_rearrange          0       0       0     1    0     1      0     2  1 256   1 512  3  3   28  28   1   1   1   1        0       0       0        0
 
  ```
 
@@ -222,3 +223,4 @@ name: DeviceImageToColumn<128, 32, 64, 4>
 avg_time: 3.12326
 GB/s: 2042.59
 ```
+Note: Column to image kernel adds to the output memory, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
diff --git a/profiler/include/profiler/profile_image_to_column_impl.hpp b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
similarity index 56%
rename from profiler/include/profiler/profile_image_to_column_impl.hpp
rename to profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
index cc929e9220..dc3d748bf6 100644
--- a/profiler/include/profiler/profile_image_to_column_impl.hpp
+++ b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp
@@ -9,9 +9,11 @@
 #include <limits>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
+#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"
+#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
-#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
+#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -19,22 +21,88 @@
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp"
 
 namespace ck {
 namespace profiler {
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
+using namespace conv_tensor_rearrange_op;
+
+template <typename InputDataType, typename ConvTensorRearrangeOp>
+Tensor<InputDataType> create_input(const HostTensorDescriptor& image_desc,
+                                   const HostTensorDescriptor& gemm_desc)
+{
+    if constexpr(std::is_same_v<ConvTensorRearrangeOp, ImageToColumn>)
+    {
+        Tensor<InputDataType> input(image_desc);
+        return input;
+    }
+    else if constexpr(std::is_same_v<ConvTensorRearrangeOp, ColumnToImage>)
+    {
+        Tensor<InputDataType> input(gemm_desc);
+        return input;
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported op!");
+    }
+}
+
+template <typename OutputDataType, typename ConvTensorRearrangeOp>
+Tensor<OutputDataType> create_output(const HostTensorDescriptor& image_desc,
+                                     const HostTensorDescriptor& gemm_desc)
+{
+    if constexpr(std::is_same_v<ConvTensorRearrangeOp, ImageToColumn>)
+    {
+        Tensor<OutputDataType> output(gemm_desc);
+        return output;
+    }
+    else if constexpr(std::is_same_v<ConvTensorRearrangeOp, ColumnToImage>)
+    {
+        Tensor<OutputDataType> output(image_desc);
+        return output;
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported op!");
+    }
+}
 
 template <index_t NDimSpatial,
           typename InputLayout,
           typename InputDataType,
-          typename OutputDataType>
-bool profile_image_to_column_impl(int do_verification,
-                                  int init_method,
-                                  bool do_log,
-                                  bool time_kernel,
-                                  const ck::utils::conv::ConvParam& conv_param)
+          typename OutputDataType,
+          typename ConvTensorRearrangeOp>
+static auto make_ref_op()
+{
+    if constexpr(std::is_same_v<ConvTensorRearrangeOp, ImageToColumn>)
+    {
+        return ck::tensor_operation::host::
+            ReferenceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>{};
+    }
+    else if constexpr(std::is_same_v<ConvTensorRearrangeOp, ColumnToImage>)
+    {
+        return ck::tensor_operation::host::
+            ReferenceColumnToImage<NDimSpatial, InputLayout, InputDataType, OutputDataType>{};
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported op!");
+    }
+}
+
+template <index_t NDimSpatial,
+          typename InputLayout,
+          typename InputDataType,
+          typename OutputDataType,
+          typename ConvTensorRearrangeOp>
+bool profile_conv_tensor_rearrange_impl(int do_verification,
+                                        int init_method,
+                                        bool do_log,
+                                        bool time_kernel,
+                                        const ck::utils::conv::ConvParam& conv_param)
 {
     const ck::index_t NDoHoWo =
         conv_param.N_ *
@@ -45,16 +113,16 @@ bool profile_image_to_column_impl(int do_verification,
         ck::accumulate_n<ck::index_t>(
             conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
 
-    const auto in_desc =
+    const auto image_desc =
         ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InputLayout>(
             conv_param);
-    const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
+    const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX});
 
     std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
     std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
     std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
-    std::array<ck::index_t, 2> output_m_k_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> image_g_n_c_wis_strides{};
+    std::array<ck::index_t, 2> gemm_m_k_strides{};
     std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
     std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
     std::array<ck::index_t, NDimSpatial> input_left_pads{};
@@ -65,16 +133,19 @@ bool profile_image_to_column_impl(int do_verification,
     copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
     copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
     copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
-    copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
-    copy(out_desc.GetStrides(), output_m_k_strides);
+    copy(image_desc.GetStrides(), image_g_n_c_wis_strides);
+    copy(gemm_desc.GetStrides(), gemm_m_k_strides);
     copy(conv_param.conv_filter_strides_, conv_filter_strides);
     copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
     copy(conv_param.input_left_pads_, input_left_pads);
     copy(conv_param.input_right_pads_, input_right_pads);
 
-    Tensor<InputDataType> input(in_desc);
-    Tensor<OutputDataType> host_output(out_desc);
-    Tensor<OutputDataType> device_output(out_desc);
+    Tensor<InputDataType> input =
+        create_input<InputDataType, ConvTensorRearrangeOp>(image_desc, gemm_desc);
+    Tensor<OutputDataType> device_output =
+        create_output<OutputDataType, ConvTensorRearrangeOp>(image_desc, gemm_desc);
+    Tensor<OutputDataType> host_output =
+        create_output<OutputDataType, ConvTensorRearrangeOp>(image_desc, gemm_desc);
 
     std::cout << "input: " << input.mDesc << std::endl;
     std::cout << "output: " << host_output.mDesc << std::endl;
@@ -94,17 +165,21 @@ bool profile_image_to_column_impl(int do_verification,
     // run reference op
     if(do_verification)
     {
-        auto ref_image_to_column = ck::tensor_operation::host::
-            ReferenceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>{};
+        auto ref_conv_tensor_rearrange = make_ref_op<NDimSpatial,
+                                                     InputLayout,
+                                                     InputDataType,
+                                                     OutputDataType,
+                                                     ConvTensorRearrangeOp>();
 
-        auto ref_invoker  = ref_image_to_column.MakeInvoker();
-        auto ref_argument = ref_image_to_column.MakeArgument(input,
-                                                             host_output,
-                                                             conv_param.filter_spatial_lengths_,
-                                                             conv_param.conv_filter_strides_,
-                                                             conv_param.conv_filter_dilations_,
-                                                             conv_param.input_left_pads_,
-                                                             conv_param.input_right_pads_);
+        auto ref_invoker = ref_conv_tensor_rearrange.MakeInvoker();
+        auto ref_argument =
+            ref_conv_tensor_rearrange.MakeArgument(input,
+                                                   host_output,
+                                                   conv_param.filter_spatial_lengths_,
+                                                   conv_param.conv_filter_strides_,
+                                                   conv_param.conv_filter_dilations_,
+                                                   conv_param.input_left_pads_,
+                                                   conv_param.input_right_pads_);
 
         // init host output to zero
         host_output.SetZero();
@@ -112,8 +187,11 @@ bool profile_image_to_column_impl(int do_verification,
         ref_invoker.Run(ref_argument);
     }
 
-    using DeviceOp = ck::tensor_operation::device::
-        DeviceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>;
+    using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange<NDimSpatial,
+                                                                             InputLayout,
+                                                                             InputDataType,
+                                                                             OutputDataType,
+                                                                             ConvTensorRearrangeOp>;
 
     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -139,8 +217,8 @@ bool profile_image_to_column_impl(int do_verification,
             input_spatial_lengths,
             filter_spatial_lengths,
             output_spatial_lengths,
-            input_g_n_c_wis_strides,
-            output_m_k_strides,
+            image_g_n_c_wis_strides,
+            gemm_m_k_strides,
             conv_filter_strides,
             conv_filter_dilations,
             input_left_pads,
diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp
index eaab5dbcc2..ccebb020c4 100644
--- a/profiler/include/profiler/profile_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_impl.hpp
@@ -223,6 +223,12 @@ int profile_gemm_impl(int do_verification,
     {
         std::cout << "Best Perf for datatype = int8";
     }
+#if defined CK_ENABLE_FP8
+    else if constexpr(is_same<CDataType, f8_t>::value)
+    {
+        std::cout << "Best Perf for datatype = fp8";
+    }
+#endif
 
     if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value)
     {
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index 7da7613f26..8d9a47cd6b 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -28,7 +28,7 @@ set(PROFILER_SOURCES
     profile_contraction_bilinear.cpp
     profile_contraction_scale.cpp
     profile_grouped_conv_bwd_data.cpp
-    profile_image_to_column.cpp
+    profile_conv_tensor_rearrange.cpp
 )
 if(DL_KERNELS)
   list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
@@ -84,6 +84,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instanc
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
 if(DL_KERNELS)
   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
 endif()
diff --git a/profiler/src/profile_conv_tensor_rearrange.cpp b/profiler/src/profile_conv_tensor_rearrange.cpp
new file mode 100644
index 0000000000..bad5ce40a5
--- /dev/null
+++ b/profiler/src/profile_conv_tensor_rearrange.cpp
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "profiler/profile_conv_tensor_rearrange_impl.hpp"
+#include "profiler_operation_registry.hpp"
+
+namespace {
+
+enum struct RearrangeOp
+{
+    ImageToColumn, // 0
+    ColumnToImage, // 1
+};
+
+enum struct ConvLayout
+{
+    NHWC, // 0
+};
+
+enum struct DataType
+{
+    F32_F32,   // 0
+    F16_F16,   // 1
+    BF16_BF16, // 2
+    INT8_INT8, // 3
+};
+
+#define OP_NAME "conv_tensor_rearrange"
+#define OP_DESC "Conv Tensor Rearrange"
+
+static void print_helper_msg()
+{
+    std::cout
+        // clang-format off
+        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
+        << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
+        << "                 1: Input fp16, Weight fp16, Output fp16\n"
+        << "                 2: Input bf16, Weight bf16, Output bf16\n"
+        << "                 3: Input int8, Weight int8, Output int8)\n"
+        << "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n"
+        << "arg4: verification (0: no, 1: yes)\n"
+        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+        << "arg6: print tensor value (0: no; 1: yes)\n"
+        << "arg7: time kernel (0: no, 1: yes)\n"
+        << "arg8: operation type (0: ImageToColumn, 1: ColumnToImage)\n"
+        << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
+    // clang-format on
+}
+
+} // namespace
+
+int profile_conv_tensor_rearrange(int argc, char* argv[])
+{
+    // 9 for control, 1 for num_dim_spatial
+    if(argc < 10)
+    {
+        print_helper_msg();
+        return 1;
+    }
+
+    const auto data_type       = static_cast<DataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const bool time_kernel     = std::stoi(argv[7]);
+    const auto rearrange_op    = static_cast<RearrangeOp>(std::stoi(argv[8]));
+    const int num_dim_spatial  = std::stoi(argv[9]);
+
+    // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
+    if(argc != 9 + 1 + 4 + 6 * num_dim_spatial)
+    {
+        print_helper_msg();
+        return 1;
+    }
+
+    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv);
+
+    using F32  = float;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
+    using INT8 = int8_t;
+
+    using namespace ck::tensor_layout::convolution;
+    using namespace ck::conv_tensor_rearrange_op;
+
+    constexpr auto I1 = ck::Number<1>{};
+    constexpr auto I2 = ck::Number<2>{};
+    constexpr auto I3 = ck::Number<3>{};
+
+    auto profile = [&](auto num_dim_spatial_tmp,
+                       auto in_layout,
+                       auto in_type,
+                       auto out_type,
+                       auto rearrange_op_type) {
+        constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
+
+        using InLayout = decltype(in_layout);
+
+        using InDataType  = decltype(in_type);
+        using OutDataType = decltype(out_type);
+
+        using Op = decltype(rearrange_op_type);
+
+        bool pass = ck::profiler::
+            profile_conv_tensor_rearrange_impl<NDimSpatial, InLayout, InDataType, OutDataType, Op>(
+                do_verification, init_method, do_log, time_kernel, params);
+
+        return pass ? 0 : 1;
+    };
+
+    // Image To Column
+    if(rearrange_op == RearrangeOp::ImageToColumn)
+    {
+        // NHWC
+        if(layout == ConvLayout::NHWC)
+        {
+            if(num_dim_spatial == 1)
+            {
+                if(data_type == DataType::F32_F32)
+                {
+                    return profile(I1, GNWC{}, F32{}, F32{}, ImageToColumn{});
+                }
+                else if(data_type == DataType::F16_F16)
+                {
+                    return profile(I1, GNWC{}, F16{}, F16{}, ImageToColumn{});
+                }
+                else if(data_type == DataType::BF16_BF16)
+                {
+                    return profile(I1, GNWC{}, BF16{}, BF16{}, ImageToColumn{});
+                }
+                else if(data_type == DataType::INT8_INT8)
+                {
+                    return profile(I1, GNWC{}, INT8{}, INT8{}, ImageToColumn{});
+                }
+            }
+            else if(num_dim_spatial == 2)
+            {
+                if(data_type == DataType::F32_F32)
+                {
+                    return profile(I2, GNHWC{}, F32{}, F32{}, ImageToColumn{});
+                }
+                else if(data_type == DataType::F16_F16)
+                {
+                    return profile(I2, GNHWC{}, F16{}, F16{}, ImageToColumn{});
+                }
+                else if(data_type == DataType::BF16_BF16)
+                {
+                    return profile(I2, GNHWC{}, BF16{}, BF16{}, ImageToColumn{});
+                }
+                else if(data_type == DataType::INT8_INT8)
+                {
+                    return profile(I2, GNHWC{}, INT8{}, INT8{}, ImageToColumn{});
+                }
+            }
+            else if(num_dim_spatial == 3)
+            {
+                if(data_type == DataType::F32_F32)
+                {
+                    return profile(I3, GNDHWC{}, F32{}, F32{}, ImageToColumn{});
+                }
+                else if(data_type == DataType::F16_F16)
+                {
+                    return profile(I3, GNDHWC{}, F16{}, F16{}, ImageToColumn{});
+                }
+                else if(data_type == DataType::BF16_BF16)
+                {
+                    return profile(I3, GNDHWC{}, BF16{}, BF16{}, ImageToColumn{});
+                }
+                else if(data_type == DataType::INT8_INT8)
+                {
+                    return profile(I3, GNDHWC{}, INT8{}, INT8{}, ImageToColumn{});
+                }
+            }
+        }
+    }
+    else if(rearrange_op == RearrangeOp::ColumnToImage)
+    {
+        // NHWC
+        if(layout == ConvLayout::NHWC)
+        {
+            if(num_dim_spatial == 1)
+            {
+                if(data_type == DataType::F32_F32)
+                {
+                    return profile(I1, GNWC{}, F32{}, F32{}, ColumnToImage{});
+                }
+                else if(data_type == DataType::F16_F16)
+                {
+                    return profile(I1, GNWC{}, F16{}, F16{}, ColumnToImage{});
+                }
+                else if(data_type == DataType::BF16_BF16)
+                {
+                    return profile(I1, GNWC{}, BF16{}, BF16{}, ColumnToImage{});
+                }
+                else if(data_type == DataType::INT8_INT8)
+                {
+                    return profile(I1, GNWC{}, INT8{}, INT8{}, ColumnToImage{});
+                }
+            }
+            else if(num_dim_spatial == 2)
+            {
+                if(data_type == DataType::F32_F32)
+                {
+                    return profile(I2, GNHWC{}, F32{}, F32{}, ColumnToImage{});
+                }
+                else if(data_type == DataType::F16_F16)
+                {
+                    return profile(I2, GNHWC{}, F16{}, F16{}, ColumnToImage{});
+                }
+                else if(data_type == DataType::BF16_BF16)
+                {
+                    return profile(I2, GNHWC{}, BF16{}, BF16{}, ColumnToImage{});
+                }
+                else if(data_type == DataType::INT8_INT8)
+                {
+                    return profile(I2, GNHWC{}, INT8{}, INT8{}, ColumnToImage{});
+                }
+            }
+            else if(num_dim_spatial == 3)
+            {
+                if(data_type == DataType::F32_F32)
+                {
+                    return profile(I3, GNDHWC{}, F32{}, F32{}, ColumnToImage{});
+                }
+                else if(data_type == DataType::F16_F16)
+                {
+                    return profile(I3, GNDHWC{}, F16{}, F16{}, ColumnToImage{});
+                }
+                else if(data_type == DataType::BF16_BF16)
+                {
+                    return profile(I3, GNDHWC{}, BF16{}, BF16{}, ColumnToImage{});
+                }
+                else if(data_type == DataType::INT8_INT8)
+                {
+                    return profile(I3, GNDHWC{}, INT8{}, INT8{}, ColumnToImage{});
+                }
+            }
+        }
+    }
+
+    std::cout << "this data_type & layout is not implemented" << std::endl;
+    return 1;
+}
+
+REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_tensor_rearrange);
diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp
index 9ca7fc4c88..df243c96d6 100644
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -23,6 +23,7 @@ enum struct GemmDataType
     F16_F16_F16,    // 1
     BF16_BF16_BF16, // 2
     INT8_INT8_INT8, // 3
+    F8_F8_F8,       // 4
 };
 
 #define OP_NAME "gemm"
@@ -31,7 +32,7 @@ enum struct GemmDataType
 static void print_helper_msg()
 {
     std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
-              << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
+              << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: fp8)\n"
               << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
               << "                     1: A[m, k] * B[n, k] = C[m, n];\n"
               << "                     2: A[k, m] * B[k, n] = C[m, n];\n"
@@ -76,6 +77,9 @@ int profile_gemm(int argc, char* argv[])
     using INT8  = int8_t;
     using INT32 = int32_t;
 #endif
+#ifdef CK_ENABLE_FP8
+    using F8 = ck::f8_t;
+#endif
 
     using Row = ck::tensor_layout::gemm::RowMajor;
     using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -194,6 +198,24 @@ int profile_gemm(int argc, char* argv[])
     {
         return profile(Col{}, Col{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
     }
+#endif
+#ifdef CK_ENABLE_FP8
+    else if(data_type == GemmDataType::F8_F8_F8 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(Row{}, Row{}, Row{}, F8{}, F8{}, F32{}, F8{});
+    }
+    else if(data_type == GemmDataType::F8_F8_F8 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(Row{}, Col{}, Row{}, F8{}, F8{}, F32{}, F8{});
+    }
+    else if(data_type == GemmDataType::F8_F8_F8 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(Col{}, Row{}, Row{}, F8{}, F8{}, F32{}, F8{});
+    }
+    else if(data_type == GemmDataType::F8_F8_F8 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(Col{}, Col{}, Row{}, F8{}, F8{}, F32{}, F8{});
+    }
 #endif
     else
     {
diff --git a/profiler/src/profile_image_to_column.cpp b/profiler/src/profile_image_to_column.cpp
deleted file mode 100644
index bf4312a6cf..0000000000
--- a/profiler/src/profile_image_to_column.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-
-#include "profiler/profile_image_to_column_impl.hpp"
-#include "profiler_operation_registry.hpp"
-
-namespace {
-
-enum struct ConvLayout
-{
-    NHWC, // 0
-};
-
-enum struct DataType
-{
-    F32_F32,   // 0
-    F16_F16,   // 1
-    BF16_BF16, // 2
-    INT8_INT8, // 3
-};
-
-#define OP_NAME "image_to_column"
-#define OP_DESC "Image To Column"
-
-static void print_helper_msg()
-{
-    std::cout
-        // clang-format off
-        << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
-        << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
-        << "                 1: Input fp16, Weight fp16, Output fp16\n"
-        << "                 2: Input bf16, Weight bf16, Output bf16\n"
-        << "                 3: Input int8, Weight int8, Output int8)\n"
-        << "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n"
-        << "arg4: verification (0: no, 1: yes)\n"
-        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
-        << "arg6: print tensor value (0: no; 1: yes)\n"
-        << "arg7: time kernel (0: no, 1: yes)\n"
-        << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
-    // clang-format on
-}
-
-} // namespace
-
-int profile_image_to_column(int argc, char* argv[])
-{
-    // 8 for control, 1 for num_dim_spatial
-    if(argc < 9)
-    {
-        print_helper_msg();
-        return 1;
-    }
-
-    const auto data_type       = static_cast<DataType>(std::stoi(argv[2]));
-    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[3]));
-    const bool do_verification = std::stoi(argv[4]);
-    const int init_method      = std::stoi(argv[5]);
-    const bool do_log          = std::stoi(argv[6]);
-    const bool time_kernel     = std::stoi(argv[7]);
-    const int num_dim_spatial  = std::stoi(argv[8]);
-
-    // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
-    if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
-    {
-        print_helper_msg();
-        return 1;
-    }
-
-    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
-
-    using F32  = float;
-    using F16  = ck::half_t;
-    using BF16 = ck::bhalf_t;
-    using INT8 = int8_t;
-
-    using namespace ck::tensor_layout::convolution;
-
-    constexpr auto I1 = ck::Number<1>{};
-    constexpr auto I2 = ck::Number<2>{};
-    constexpr auto I3 = ck::Number<3>{};
-
-    auto profile = [&](auto num_dim_spatial_tmp, auto in_layout, auto in_type, auto out_type) {
-        constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
-
-        using InLayout = decltype(in_layout);
-
-        using InDataType  = decltype(in_type);
-        using OutDataType = decltype(out_type);
-
-        bool pass = ck::profiler::
-            profile_image_to_column_impl<NDimSpatial, InLayout, InDataType, OutDataType>(
-                do_verification, init_method, do_log, time_kernel, params);
-
-        return pass ? 0 : 1;
-    };
-
-    // NHWC
-    if(layout == ConvLayout::NHWC)
-    {
-        if(num_dim_spatial == 1)
-        {
-            if(data_type == DataType::F32_F32)
-            {
-                return profile(I1, GNWC{}, F32{}, F32{});
-            }
-            else if(data_type == DataType::F16_F16)
-            {
-                return profile(I1, GNWC{}, F16{}, F16{});
-            }
-            else if(data_type == DataType::BF16_BF16)
-            {
-                return profile(I1, GNWC{}, BF16{}, BF16{});
-            }
-            else if(data_type == DataType::INT8_INT8)
-            {
-                return profile(I1, GNWC{}, INT8{}, INT8{});
-            }
-        }
-        else if(num_dim_spatial == 2)
-        {
-            if(data_type == DataType::F32_F32)
-            {
-                return profile(I2, GNHWC{}, F32{}, F32{});
-            }
-            else if(data_type == DataType::F16_F16)
-            {
-                return profile(I2, GNHWC{}, F16{}, F16{});
-            }
-            else if(data_type == DataType::BF16_BF16)
-            {
-                return profile(I2, GNHWC{}, BF16{}, BF16{});
-            }
-            else if(data_type == DataType::INT8_INT8)
-            {
-                return profile(I2, GNHWC{}, INT8{}, INT8{});
-            }
-        }
-        else if(num_dim_spatial == 3)
-        {
-            if(data_type == DataType::F32_F32)
-            {
-                return profile(I3, GNDHWC{}, F32{}, F32{});
-            }
-            else if(data_type == DataType::F16_F16)
-            {
-                return profile(I3, GNDHWC{}, F16{}, F16{});
-            }
-            else if(data_type == DataType::BF16_BF16)
-            {
-                return profile(I3, GNDHWC{}, BF16{}, BF16{});
-            }
-            else if(data_type == DataType::INT8_INT8)
-            {
-                return profile(I3, GNDHWC{}, INT8{}, INT8{});
-            }
-        }
-    }
-
-    std::cout << "this data_type & layout is not implemented" << std::endl;
-
-    return 1;
-}
-
-REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_image_to_column);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 368965aa8a..07dd675afc 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -155,7 +155,7 @@ add_subdirectory(contraction)
 add_subdirectory(pool)
 add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
-add_subdirectory(image_to_column)
+add_subdirectory(conv_tensor_rearrange)
 if(GPU_TARGETS MATCHES "gfx11")
     add_subdirectory(wmma_op)
 endif()
diff --git a/test/conv_tensor_rearrange/CMakeLists.txt b/test/conv_tensor_rearrange/CMakeLists.txt
new file mode 100644
index 0000000000..f6ad263242
--- /dev/null
+++ b/test/conv_tensor_rearrange/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_gtest_executable(test_conv_tensor_rearrange test_conv_tensor_rearrange.cpp)
+target_link_libraries(test_conv_tensor_rearrange PRIVATE utility device_image_to_column_instance device_column_to_image_instance)
+add_gtest_executable(test_conv_tensor_rearrange_interface test_conv_tensor_rearrange_interface.cpp)
+target_link_libraries(test_conv_tensor_rearrange_interface PRIVATE utility)
diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
new file mode 100644
index 0000000000..7065b03e0d
--- /dev/null
+++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "profiler/profile_conv_tensor_rearrange_impl.hpp"
+
+template <typename Tuple>
+class TestConvTensorRearrange : public ::testing::Test
+{
+    protected:
+    using ImLayout              = std::tuple_element_t<0, Tuple>;
+    using ConvTensorRearrangeOp = std::tuple_element_t<1, Tuple>;
+
+    std::vector<ck::utils::conv::ConvParam> conv_params;
+
+    template <ck::index_t NDimSpatial, typename InDataType, typename OutDataType>
+    void Run()
+    {
+        EXPECT_FALSE(conv_params.empty());
+        bool pass = true;
+        for(auto& param : conv_params)
+        {
+            pass = pass && ck::profiler::profile_conv_tensor_rearrange_impl<NDimSpatial,
+                                                                            ImLayout,
+                                                                            InDataType,
+                                                                            OutDataType,
+                                                                            ConvTensorRearrangeOp>(
+                               true,  // do_verification
+                               1,     // init_method: integer value
+                               false, // do_log
+                               false, // time_kernel
+                               param);
+        }
+        EXPECT_TRUE(pass);
+    }
+};
+
+using namespace ck::tensor_layout::convolution;
+using namespace ck::conv_tensor_rearrange_op;
+
+using KernelTypes1d =
+    ::testing::Types<std::tuple<GNWC, ImageToColumn>, std::tuple<GNWC, ColumnToImage>>;
+
+using KernelTypes2d =
+    ::testing::Types<std::tuple<GNHWC, ImageToColumn>, std::tuple<GNHWC, ColumnToImage>>;
+
+using KernelTypes3d =
+    ::testing::Types<std::tuple<GNDHWC, ImageToColumn>, std::tuple<GNDHWC, ColumnToImage>>;
+
+template <typename Tuple>
+class TestConvTensorRearrange1d : public TestConvTensorRearrange<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestConvTensorRearrange2d : public TestConvTensorRearrange<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestConvTensorRearrange3d : public TestConvTensorRearrange<Tuple>
+{
+};
+
+TYPED_TEST_SUITE(TestConvTensorRearrange1d, KernelTypes1d);
+TYPED_TEST_SUITE(TestConvTensorRearrange2d, KernelTypes2d);
+TYPED_TEST_SUITE(TestConvTensorRearrange3d, KernelTypes3d);
+
+TYPED_TEST(TestConvTensorRearrange1d, Test1D)
+{
+    this->conv_params.clear();
+
+    this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
+    // ScalarPerVector should be 1
+    this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
+    // stride != 1
+    this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
+    // dilation != 1
+    this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
+#ifdef CK_ENABLE_FP32
+    this->template Run<1, float, float>();
+#endif
+#ifdef CK_ENABLE_BF16
+    this->template Run<1, ck::bhalf_t, ck::bhalf_t>();
+#endif
+#ifdef CK_ENABLE_FP16
+    this->template Run<1, ck::half_t, ck::half_t>();
+#endif
+#ifdef CK_ENABLE_INT8
+    this->template Run<1, int8_t, int8_t>();
+#endif
+}
+
+TYPED_TEST(TestConvTensorRearrange2d, Test2D)
+{
+    this->conv_params.clear();
+
+    this->conv_params.push_back(
+        {2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back(
+        {2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back(
+        {2, 1, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
+#ifdef CK_ENABLE_FP32
+    this->template Run<2, float, float>();
+#endif
+#ifdef CK_ENABLE_BF16
+    this->template Run<2, ck::bhalf_t, ck::bhalf_t>();
+#endif
+#ifdef CK_ENABLE_FP16
+    this->template Run<2, ck::half_t, ck::half_t>();
+#endif
+#ifdef CK_ENABLE_INT8
+    this->template Run<2, int8_t, int8_t>();
+#endif
+}
+
+TYPED_TEST(TestConvTensorRearrange3d, Test3D)
+{
+    this->conv_params.clear();
+    this->conv_params.push_back(
+        {3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back(
+        {3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->conv_params.push_back(
+        {3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back(
+        {3, 1, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}});
+#ifdef CK_ENABLE_FP32
+    this->template Run<3, float, float>();
+#endif
+#ifdef CK_ENABLE_BF16
+    this->template Run<3, ck::bhalf_t, ck::bhalf_t>();
+#endif
+#ifdef CK_ENABLE_FP16
+    this->template Run<3, ck::half_t, ck::half_t>();
+#endif
+#ifdef CK_ENABLE_INT8
+    this->template Run<3, int8_t, int8_t>();
+#endif
+}
diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp
new file mode 100644
index 0000000000..57fcdc4269
--- /dev/null
+++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp"
+#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
+
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using DataType = float;
+using ImLayout = ck::tensor_layout::convolution::GNWC;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using namespace ck::conv_tensor_rearrange_op;
+
+template <ck::index_t ScalarPerVector, bool IsCPacked>
+class TestConvTensorRearrangeInterface : public ::testing::Test
+{
+    protected:
+    static constexpr ck::index_t NDimSpatial = 1;
+
+    // clang-format off
+    using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
+        //        Num| ImLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //           |         |           |            |      |      |      |          |       |
+        < NDimSpatial, ImLayout,   DataType,    DataType,   256,   128,   128, S<16, 16>,ScalarPerVector>;
+    using DeviceColToimgInstance = ck::tensor_operation::device::DeviceColumnToImageImpl
+        //        Num| ImLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //           |         |           |            |      |      |      |          |       |
+        < NDimSpatial, ImLayout,   DataType,    DataType,   256,   128,   128, S<16, 16>,ScalarPerVector>;
+    // clang-format on
+
+    ck::utils::conv::ConvParam conv_param;
+
+    template <typename ConvTensorRearrangeOp>
+    bool Run()
+    {
+
+        const auto N = conv_param.N_;
+        const auto C = conv_param.C_;
+        const auto FakeC =
+            conv_param.C_ / 2; // Fake C to simulate the behavior that C is not packed
+
+        const ck::index_t NDoHoWo =
+            N *
+            ck::accumulate_n<ck::index_t>(
+                conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+        const ck::index_t CZYX =
+            C *
+            ck::accumulate_n<ck::index_t>(
+                conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+
+        const auto image_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(
+                conv_param);
+        const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX});
+
+        std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+        std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
+        std::array<ck::index_t, 2> output_m_k_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+        std::array<ck::index_t, NDimSpatial> input_left_pads{};
+        std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+        auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+
+        copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
+        copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
+        copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
+        copy(image_desc.GetStrides(), input_g_n_c_wis_strides);
+        copy(gemm_desc.GetStrides(), output_m_k_strides);
+        copy(conv_param.conv_filter_strides_, conv_filter_strides);
+        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+        copy(conv_param.input_left_pads_, input_left_pads);
+        copy(conv_param.input_right_pads_, input_right_pads);
+
+        if constexpr(std::is_same_v<ConvTensorRearrangeOp, ImageToColumn>)
+        {
+            auto img2col  = DeviceImgToColInstance{};
+            auto argument = img2col.MakeArgument(nullptr,
+                                                 nullptr,
+                                                 N,
+                                                 IsCPacked ? C : FakeC,
+                                                 input_spatial_lengths,
+                                                 filter_spatial_lengths,
+                                                 output_spatial_lengths,
+                                                 input_g_n_c_wis_strides,
+                                                 output_m_k_strides,
+                                                 conv_filter_strides,
+                                                 conv_filter_dilations,
+                                                 input_left_pads,
+                                                 input_right_pads);
+
+            return img2col.IsSupportedArgument(argument);
+        }
+        else if constexpr(std::is_same_v<ConvTensorRearrangeOp, ColumnToImage>)
+        {
+            auto col2img  = DeviceColToimgInstance{};
+            auto argument = col2img.MakeArgument(nullptr,
+                                                 nullptr,
+                                                 N,
+                                                 IsCPacked ? C : FakeC,
+                                                 input_spatial_lengths,
+                                                 filter_spatial_lengths,
+                                                 output_spatial_lengths,
+                                                 input_g_n_c_wis_strides,
+                                                 output_m_k_strides,
+                                                 conv_filter_strides,
+                                                 conv_filter_dilations,
+                                                 input_left_pads,
+                                                 input_right_pads);
+
+            return col2img.IsSupportedArgument(argument);
+        }
+    }
+};
+
+class TestConvTensorRearrangeInterface1ScalarPerVector
+    : public TestConvTensorRearrangeInterface<1, true>
+{
+};
+
+class TestConvTensorRearrangeInterface4ScalarPerVector
+    : public TestConvTensorRearrangeInterface<4, true>
+{
+};
+
+class TestConvTensorRearrangeInterface4ScalarPerVectorFakeC
+    : public TestConvTensorRearrangeInterface<4, false>
+{
+};
+
+TEST_F(TestConvTensorRearrangeInterface1ScalarPerVector, X1ScalarPerVector)
+{
+    // vector load C * X % ScalarPerVector
+    this->conv_param  = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
+    bool is_supported = this->template Run<ImageToColumn>();
+    EXPECT_TRUE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_TRUE(is_supported);
+    // vector load C * left_pad_x % ScalarPerVector
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_TRUE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_TRUE(is_supported);
+    // vector load C * right_pad_x % ScalarPerVector
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_TRUE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_TRUE(is_supported);
+    // vector load C % ScalarPerVector, right_pad and stride
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_TRUE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_TRUE(is_supported);
+    // vector load C % ScalarPerVector, left_pad and stride
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_TRUE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_TRUE(is_supported);
+    // vector load C % ScalarPerVector, dilation
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_TRUE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_TRUE(is_supported);
+    // C = 4
+    this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_TRUE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_TRUE(is_supported);
+}
+
+TEST_F(TestConvTensorRearrangeInterface4ScalarPerVector, X4ScalarPerVector)
+{
+    // vector load C * X % ScalarPerVector
+    this->conv_param  = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
+    bool is_supported = this->template Run<ImageToColumn>();
+    EXPECT_FALSE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_FALSE(is_supported);
+    // vector load C * left_pad_x % ScalarPerVector
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_FALSE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_FALSE(is_supported);
+    // vector load C * right_pad_x % ScalarPerVector
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_FALSE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_FALSE(is_supported);
+    // vector load C % ScalarPerVector, right_pad and stride
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_FALSE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_FALSE(is_supported);
+    // vector load C % ScalarPerVector, left_pad and stride
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_FALSE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_FALSE(is_supported);
+    // vector load C % ScalarPerVector, dilation
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_FALSE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_FALSE(is_supported);
+    // C = 4
+    this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_TRUE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_TRUE(is_supported);
+}
+
+TEST_F(TestConvTensorRearrangeInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC)
+{
+    // C = 3
+    this->conv_param  = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}};
+    bool is_supported = this->template Run<ImageToColumn>();
+    EXPECT_FALSE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_FALSE(is_supported);
+    // C = 4
+    this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}};
+    is_supported     = this->template Run<ImageToColumn>();
+    EXPECT_TRUE(is_supported);
+    is_supported = this->template Run<ColumnToImage>();
+    EXPECT_TRUE(is_supported);
+}
diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt
index b1606d2a75..2409ca05c2 100644
--- a/test/data_type/CMakeLists.txt
+++ b/test/data_type/CMakeLists.txt
@@ -13,3 +13,5 @@ add_gtest_executable(test_bf8 bf8.cpp)
 if(result EQUAL 0)
   target_link_libraries(test_bf8 PRIVATE utility)
 endif()
+
+add_gtest_executable(test_type_convert_const type_convert_const.cpp)
diff --git a/test/data_type/type_convert_const.cpp b/test/data_type/type_convert_const.cpp
new file mode 100644
index 0000000000..8b9c34861a
--- /dev/null
+++ b/test/data_type/type_convert_const.cpp
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/type_convert.hpp"
+
+using ck::bhalf_t;
+using ck::type_convert;
+
+TEST(TypeConvertConst, ConvertToConst)
+{
+    constexpr float bf16_epsilon = 0.0078125;
+    constexpr float rel_tol      = 2 * bf16_epsilon;
+
+    const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
+
+    for(float x : cases)
+    {
+        const float abs_tol = std::abs(rel_tol * x);
+        {
+            bhalf_t y = type_convert<bhalf_t>(x);
+            // Test non-const bhalf to const float.
+            const float y_float = type_convert<const float>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            // Test non-const float to const bhalf.
+            const bhalf_t y = type_convert<const bhalf_t>(x);
+            // Remove the constness manually to not rely on const casts anymore since the
+            // possible issue could hide after two casts.
+            bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
+            float y_float       = type_convert<float>(y_nonconst);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+    }
+}
+
+TEST(TypeConvertConst, ConvertFromConst)
+{
+    constexpr float bf16_epsilon = 0.0078125;
+    constexpr float rel_tol      = 2 * bf16_epsilon;
+
+    const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
+
+    for(const float x : cases)
+    {
+        const float abs_tol = std::abs(rel_tol * x);
+        {
+            // Test const float to const bhalf_t.
+            const bhalf_t y = type_convert<const bhalf_t>(x);
+            // Remove the constness manually to not rely on const casts anymore since the
+            // possible issue could hide after two casts.
+            bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
+            float y_float       = type_convert<float>(y_nonconst);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            // Test const float to non-const bhalf.
+            bhalf_t y     = type_convert<bhalf_t>(x);
+            float y_float = type_convert<float>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            const bhalf_t y = type_convert<const bhalf_t>(x);
+            // Test const bhalf to non-const float.
+            float y_float = type_convert<float>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        // Tests with full type specializations for X.
+        {
+            // Test const float to const bhalf_t.
+            const bhalf_t y = type_convert<const bhalf_t, const float>(x);
+            // Remove the constness manually to not rely on const casts anymore since the
+            // possible issue could hide after two casts.
+            bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
+            float y_float       = type_convert<float>(y_nonconst);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            // Test const float to non-const bhalf.
+            bhalf_t y     = type_convert<bhalf_t, const float>(x);
+            float y_float = type_convert<float>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            const bhalf_t y = type_convert<const bhalf_t, const float>(x);
+            // Test const bhalf to non-const float.
+            float y_float = type_convert<float, const bhalf_t>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+    }
+}
diff --git a/test/gemm_split_k/test_gemm_splitk_ut_cases.inc b/test/gemm_split_k/test_gemm_splitk_ut_cases.inc
index 54b9c6c9e3..d583a3e377 100644
--- a/test/gemm_split_k/test_gemm_splitk_ut_cases.inc
+++ b/test/gemm_split_k/test_gemm_splitk_ut_cases.inc
@@ -2,7 +2,7 @@
 
 TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
 {
-    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 512;
     constexpr int K = 320;
 
@@ -16,7 +16,7 @@ TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
 
 TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
 {
-    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 512;
     constexpr int K = 320;
 
@@ -30,7 +30,7 @@ TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
 
 TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
 {
-    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 512;
     constexpr int K = 320;
 
@@ -43,7 +43,7 @@ TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
 
 TYPED_TEST(TestGemmSplitK_KM_NK, SmallM)
 {
-    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
     constexpr int N = 512;
     constexpr int K = 320;
 
diff --git a/test/image_to_column/CMakeLists.txt b/test/image_to_column/CMakeLists.txt
deleted file mode 100644
index 0feb827b55..0000000000
--- a/test/image_to_column/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_gtest_executable(test_image_to_column test_image_to_column.cpp)
-target_link_libraries(test_image_to_column PRIVATE utility device_image_to_column_instance)
-add_gtest_executable(test_image_to_column_interface test_image_to_column_interface.cpp)
-target_link_libraries(test_image_to_column_interface PRIVATE utility)
diff --git a/test/image_to_column/test_image_to_column.cpp b/test/image_to_column/test_image_to_column.cpp
deleted file mode 100644
index 0b17cac2d0..0000000000
--- a/test/image_to_column/test_image_to_column.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-#include <iostream>
-#include <initializer_list>
-#include <tuple>
-#include <vector>
-
-#include <gtest/gtest.h>
-
-#include "profiler/profile_image_to_column_impl.hpp"
-
-template <typename Tuple>
-class TestImageToColumn : public ::testing::Test
-{
-    protected:
-    using InDataType  = std::tuple_element_t<0, Tuple>;
-    using OutDataType = std::tuple_element_t<1, Tuple>;
-    using InLayout    = std::tuple_element_t<2, Tuple>;
-
-    std::vector<ck::utils::conv::ConvParam> conv_params;
-
-    template <ck::index_t NDimSpatial>
-    void Run()
-    {
-        EXPECT_FALSE(conv_params.empty());
-        bool pass = true;
-        for(auto& param : conv_params)
-        {
-            pass = pass && ck::profiler::profile_image_to_column_impl<NDimSpatial,
-                                                                      InLayout,
-                                                                      InDataType,
-                                                                      OutDataType>(
-                               true,  // do_verification
-                               1,     // init_method: integer value
-                               false, // do_log
-                               false, // time_kernel
-                               param);
-        }
-        EXPECT_TRUE(pass);
-    }
-};
-
-using namespace ck::tensor_layout::convolution;
-
-using KernelTypes1d = ::testing::Types<std::tuple<float, float, GNWC>,
-                                       std::tuple<ck::bhalf_t, ck::bhalf_t, GNWC>,
-                                       std::tuple<ck::half_t, ck::half_t, GNWC>,
-                                       std::tuple<int8_t, int8_t, GNWC>>;
-
-using KernelTypes2d = ::testing::Types<std::tuple<float, float, GNHWC>,
-                                       std::tuple<ck::bhalf_t, ck::bhalf_t, GNHWC>,
-                                       std::tuple<ck::half_t, ck::half_t, GNHWC>,
-                                       std::tuple<int8_t, int8_t, GNHWC>>;
-
-using KernelTypes3d = ::testing::Types<std::tuple<float, float, GNDHWC>,
-                                       std::tuple<ck::bhalf_t, ck::bhalf_t, GNDHWC>,
-                                       std::tuple<ck::half_t, ck::half_t, GNDHWC>,
-                                       std::tuple<int8_t, int8_t, GNDHWC>>;
-
-template <typename Tuple>
-class TestImageToColumn1d : public TestImageToColumn<Tuple>
-{
-};
-
-template <typename Tuple>
-class TestImageToColumn2d : public TestImageToColumn<Tuple>
-{
-};
-
-template <typename Tuple>
-class TestImageToColumn3d : public TestImageToColumn<Tuple>
-{
-};
-
-TYPED_TEST_SUITE(TestImageToColumn1d, KernelTypes1d);
-TYPED_TEST_SUITE(TestImageToColumn2d, KernelTypes2d);
-TYPED_TEST_SUITE(TestImageToColumn3d, KernelTypes3d);
-
-TYPED_TEST(TestImageToColumn1d, Test1D)
-{
-    this->conv_params.clear();
-
-    this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
-    this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
-    this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {2}, {1}, {0}, {0}});
-    this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
-    // ScalarPerVector should be 1
-    this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
-    // stride != 1
-    this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
-    // dilation != 1
-    this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
-    this->template Run<1>();
-}
-
-TYPED_TEST(TestImageToColumn2d, Test2D)
-{
-    this->conv_params.clear();
-
-    this->conv_params.push_back(
-        {2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
-    this->conv_params.push_back(
-        {2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
-    this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
-    this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
-    this->template Run<2>();
-}
-
-TYPED_TEST(TestImageToColumn3d, Test3D)
-{
-    this->conv_params.clear();
-    this->conv_params.push_back(
-        {3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
-    this->conv_params.push_back(
-        {3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
-    this->conv_params.push_back(
-        {3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
-    this->template Run<3>();
-}
diff --git a/test/image_to_column/test_image_to_column_interface.cpp b/test/image_to_column/test_image_to_column_interface.cpp
deleted file mode 100644
index ea8b9632e1..0000000000
--- a/test/image_to_column/test_image_to_column_interface.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-
-#include <cstdlib>
-#include <iostream>
-#include <initializer_list>
-#include <tuple>
-#include <vector>
-
-#include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
-
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/algorithm.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using DataType = float;
-using InLayout = ck::tensor_layout::convolution::GNWC;
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-template <ck::index_t ScalarPerVector, bool IsCPacked>
-class TestImageToColumnInterface : public ::testing::Test
-{
-    protected:
-    static constexpr ck::index_t NDimSpatial = 1;
-
-    // clang-format off
-    using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
-        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread|         Scalar|
-        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|            Per|
-        //#####################|    Spatial|         |           |            |      |      |      |   Lengths|         Vector|
-        //#####################|           |         |           |            |      |      |      |          |               |
-                              < NDimSpatial, InLayout,   DataType,    DataType,   256,   128,   128, S<16, 16>,ScalarPerVector>;
-    // clang-format on
-
-    ck::utils::conv::ConvParam conv_param;
-
-    bool Run()
-    {
-
-        const auto N = conv_param.N_;
-        const auto C = conv_param.C_;
-        const auto FakeC =
-            conv_param.C_ / 2; // Fake C to simulate the behavior that C is not packed
-
-        const ck::index_t NDoHoWo =
-            N *
-            ck::accumulate_n<ck::index_t>(
-                conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
-        const ck::index_t CZYX =
-            C *
-            ck::accumulate_n<ck::index_t>(
-                conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
-
-        const auto in_desc =
-            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
-                conv_param);
-        const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
-
-        std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
-        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
-        std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
-        std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
-        std::array<ck::index_t, 2> output_m_k_strides{};
-        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
-        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
-        std::array<ck::index_t, NDimSpatial> input_left_pads{};
-        std::array<ck::index_t, NDimSpatial> input_right_pads{};
-
-        auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
-
-        copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
-        copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
-        copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
-        copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
-        copy(out_desc.GetStrides(), output_m_k_strides);
-        copy(conv_param.conv_filter_strides_, conv_filter_strides);
-        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
-        copy(conv_param.input_left_pads_, input_left_pads);
-        copy(conv_param.input_right_pads_, input_right_pads);
-
-        auto img2col  = DeviceImgToColInstance{};
-        auto argument = img2col.MakeArgument(nullptr,
-                                             nullptr,
-                                             N,
-                                             IsCPacked ? C : FakeC,
-                                             input_spatial_lengths,
-                                             filter_spatial_lengths,
-                                             output_spatial_lengths,
-                                             input_g_n_c_wis_strides,
-                                             output_m_k_strides,
-                                             conv_filter_strides,
-                                             conv_filter_dilations,
-                                             input_left_pads,
-                                             input_right_pads);
-
-        return img2col.IsSupportedArgument(argument);
-    }
-};
-
-class TestImageToColumnInterface1ScalarPerVector : public TestImageToColumnInterface<1, true>
-{
-};
-
-class TestImageToColumnInterface4ScalarPerVector : public TestImageToColumnInterface<4, true>
-{
-};
-
-class TestImageToColumnInterface4ScalarPerVectorFakeC : public TestImageToColumnInterface<4, false>
-{
-};
-
-TEST_F(TestImageToColumnInterface1ScalarPerVector, X1ScalarPerVector)
-{
-    // vector load C * X % ScalarPerVector
-    this->conv_param  = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
-    bool is_supported = this->Run();
-    EXPECT_TRUE(is_supported);
-    // vector load C * left_pad_x % ScalarPerVector
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
-    is_supported     = this->Run();
-    EXPECT_TRUE(is_supported);
-    // vector load C * right_pad_x % ScalarPerVector
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
-    is_supported     = this->Run();
-    EXPECT_TRUE(is_supported);
-
-    // vector load C % ScalarPerVector, right_pad and stride
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
-    is_supported     = this->Run();
-    EXPECT_TRUE(is_supported);
-    // vector load C % ScalarPerVector, left_pad and stride
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
-    is_supported     = this->Run();
-    EXPECT_TRUE(is_supported);
-    // vector load C % ScalarPerVector, dilation
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
-    is_supported     = this->Run();
-    EXPECT_TRUE(is_supported);
-
-    // C = 4
-    this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
-    is_supported     = this->Run();
-    EXPECT_TRUE(is_supported);
-}
-
-TEST_F(TestImageToColumnInterface4ScalarPerVector, X4ScalarPerVector)
-{
-    // vector load C * X % ScalarPerVector
-    this->conv_param  = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
-    bool is_supported = this->Run();
-    EXPECT_FALSE(is_supported);
-    // vector load C * left_pad_x % ScalarPerVector
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
-    is_supported     = this->Run();
-    EXPECT_FALSE(is_supported);
-    // vector load C * right_pad_x % ScalarPerVector
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
-    is_supported     = this->Run();
-    EXPECT_FALSE(is_supported);
-
-    // vector load C % ScalarPerVector, right_pad and stride
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
-    is_supported     = this->Run();
-    EXPECT_FALSE(is_supported);
-    // vector load C % ScalarPerVector, left_pad and stride
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
-    is_supported     = this->Run();
-    EXPECT_FALSE(is_supported);
-    // vector load C % ScalarPerVector, dilation
-    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
-    is_supported     = this->Run();
-    EXPECT_FALSE(is_supported);
-
-    // C = 4
-    this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
-    is_supported     = this->Run();
-    EXPECT_TRUE(is_supported);
-}
-
-TEST_F(TestImageToColumnInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC)
-{
-    // C = 3
-    this->conv_param  = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}};
-    bool is_supported = this->Run();
-    EXPECT_FALSE(is_supported);
-    // C = 4
-    this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}};
-    is_supported     = this->Run();
-    EXPECT_TRUE(is_supported);
-}