mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
sync upstream again
This commit is contained in:
@@ -1,11 +0,0 @@
|
||||
# One executable per data type; each variant links the same CK device-conv
# operations library and only differs in the instantiating .cpp file.
foreach(dtype fp32 fp16 bf16 int8)
    add_executable(client_grouped_convnd_fwd_scaleadd_ab_${dtype}
                   grouped_conv_fwd_scaleadd_ab_${dtype}.cpp)
    target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_${dtype}
                          PRIVATE composable_kernel::device_conv_operations)
endforeach()
|
||||
@@ -1,221 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <array>
#include <cstddef>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <limits>
#include <numeric>
#include <string>
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_ab.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
// Tensor layouts for the 3D grouped convolution example:
// activations in NDHWGC, weights in GKZYXC, outputs in NDHWGK.
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
// Elementwise operators applied by the device op: ScaleAdd combines a tensor
// with a scaled companion tensor (used on both A and B below); PassThrough
// leaves the output element unchanged.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd;

// Fixed problem size shared by every data-type variant of this example.
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 32;  // number of groups
static constexpr ck::index_t N = 64;  // batch size
static constexpr ck::index_t K = 64;  // output channel
static constexpr ck::index_t C = 32;  // input channel (per group)
static constexpr ck::index_t Z = 3;   // filter D
static constexpr ck::index_t Y = 3;   // filter H
static constexpr ck::index_t X = 3;   // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int execute_conv_fwd_scaleadd_ab()
|
||||
{
|
||||
constexpr ck::index_t NumAs = 2;
|
||||
constexpr ck::index_t NumBs = 2;
|
||||
|
||||
constexpr float scale = 1.5f;
|
||||
|
||||
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
|
||||
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
|
||||
// Hence, we need to adjust the order of strides.
|
||||
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
|
||||
std::array<ck::index_t, 6> in_strides{
|
||||
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
|
||||
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
|
||||
std::array<ck::index_t, 6> wei_strides{
|
||||
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
|
||||
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
|
||||
std::array<ck::index_t, 6> out_strides{
|
||||
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
|
||||
|
||||
using InputDtype = ck::tuple_element_t<0, InDataType>;
|
||||
using InputBiasDtype = ck::tuple_element_t<1, InDataType>;
|
||||
using WeightDtype = ck::tuple_element_t<0, WeiDataType>;
|
||||
using WeightBiasDtype = ck::tuple_element_t<1, WeiDataType>;
|
||||
|
||||
SimpleDeviceMem in(sizeof(InputDtype) * N * Di * Hi * Wi * G * C);
|
||||
SimpleDeviceMem in_bias(sizeof(InputBiasDtype) * N * Di * Hi * Wi * G * C);
|
||||
SimpleDeviceMem wei(sizeof(WeightDtype) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem wei_bias(sizeof(WeightBiasDtype) * G * K * Z * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
ScaleAdd,
|
||||
ScaleAdd,
|
||||
PassThrough>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
std::array<const void*, NumAs> as = {in.GetDeviceBuffer(), in_bias.GetDeviceBuffer()};
|
||||
std::array<const void*, NumBs> bs = {wei.GetDeviceBuffer(), wei_bias.GetDeviceBuffer()};
|
||||
std::array<const void*, 0> ds{};
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(as,
|
||||
bs,
|
||||
ds,
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
ScaleAdd{scale},
|
||||
ScaleAdd{scale},
|
||||
PassThrough{});
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Z * Y * X +
|
||||
N * Di * Hi * Wi * G * C + G * K * Z * Y * X * C;
|
||||
std::size_t num_bytes = 2 * sizeof(InDataType) * N * Di * Hi * Wi * G * C +
|
||||
2 * sizeof(WeiDataType) * G * K * Z * Y * X * C +
|
||||
sizeof(OutDataType) * N * Do * Ho * Wo * G * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
{
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(as,
|
||||
bs,
|
||||
ds,
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
ScaleAdd{scale},
|
||||
ScaleAdd{scale},
|
||||
PassThrough{});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -1,13 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

// bf16 instantiation of the grouped conv fwd ScaleAdd-AB client example.
// Input and weight each carry a companion bias tensor for the fused
// ScaleAdd, hence the two-element tuples.
using InDataType = ck::Tuple<ck::bhalf_t, ck::bhalf_t>;
using WeiDataType = ck::Tuple<ck::bhalf_t, ck::bhalf_t>;
using OutDataType = ck::bhalf_t;

// Shared example body; must be included after the aliases above, which it
// refers to.
#include "grouped_conv_fwd_scaleadd_ab.inc"

int main() { return execute_conv_fwd_scaleadd_ab(); }
|
||||
@@ -1,13 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

// fp16 instantiation of the grouped conv fwd ScaleAdd-AB client example.
// Input and weight each carry a companion bias tensor for the fused
// ScaleAdd, hence the two-element tuples.
using InDataType = ck::Tuple<ck::half_t, ck::half_t>;
using WeiDataType = ck::Tuple<ck::half_t, ck::half_t>;
using OutDataType = ck::half_t;

// Shared example body; must be included after the aliases above, which it
// refers to.
#include "grouped_conv_fwd_scaleadd_ab.inc"

int main() { return execute_conv_fwd_scaleadd_ab(); }
|
||||
@@ -1,13 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

// fp32 instantiation of the grouped conv fwd ScaleAdd-AB client example.
// Input and weight each carry a companion bias tensor for the fused
// ScaleAdd, hence the two-element tuples.
using InDataType = ck::Tuple<float, float>;
using WeiDataType = ck::Tuple<float, float>;
using OutDataType = float;

// Shared example body; must be included after the aliases above, which it
// refers to.
#include "grouped_conv_fwd_scaleadd_ab.inc"

int main() { return execute_conv_fwd_scaleadd_ab(); }
|
||||
@@ -1,13 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

// int8 instantiation of the grouped conv fwd ScaleAdd-AB client example.
// Input and weight each carry a companion bias tensor for the fused
// ScaleAdd, hence the two-element tuples.
using InDataType = ck::Tuple<int8_t, int8_t>;
using WeiDataType = ck::Tuple<int8_t, int8_t>;
using OutDataType = int8_t;

// Shared example body; must be included after the aliases above, which it
// refers to.
#include "grouped_conv_fwd_scaleadd_ab.inc"

int main() { return execute_conv_fwd_scaleadd_ab(); }
|
||||
@@ -1,54 +0,0 @@
|
||||
.. meta::
|
||||
:description: Composable Kernel documentation and API reference library
|
||||
:keywords: composable kernel, CK, ROCm, API, documentation
|
||||
|
||||
.. _api-reference:
|
||||
|
||||
********************************************************************
|
||||
API reference guide
|
||||
********************************************************************
|
||||
|
||||
|
||||
This document contains details of the APIs for the Composable Kernel (CK) library and introduces
|
||||
some of the key design principles that are used to write new classes that extend CK functionality.
|
||||
|
||||
=================
|
||||
Using CK API
|
||||
=================
|
||||
|
||||
This section describes how to use the CK library API.
|
||||
|
||||
=================
|
||||
CK Datatypes
|
||||
=================
|
||||
|
||||
-----------------
|
||||
DeviceMem
|
||||
-----------------
|
||||
|
||||
.. doxygenstruct:: DeviceMem
|
||||
|
||||
---------------------------
|
||||
Kernels For Flashattention
|
||||
---------------------------
|
||||
|
||||
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This section lists
|
||||
the classes that are used in the CK GPU implementation of Flashattention.
|
||||
|
||||
**Gridwise classes**
|
||||
|
||||
.. doxygenstruct:: ck::GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
|
||||
|
||||
**Blockwise classes**
|
||||
|
||||
.. doxygenstruct:: ck::ThreadGroupTensorSliceTransfer_v4r1
|
||||
|
||||
.. doxygenstruct:: ck::BlockwiseGemmXdlops_v2
|
||||
|
||||
.. doxygenstruct:: ck::BlockwiseSoftmax
|
||||
|
||||
**Threadwise classes**
|
||||
|
||||
.. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic
|
||||
|
||||
.. bibliography::
|
||||
@@ -1,80 +0,0 @@
|
||||
.. meta::
|
||||
:description: Composable Kernel documentation and API reference library
|
||||
:keywords: composable kernel, CK, ROCm, API, documentation
|
||||
|
||||
.. _supported-primitives:
|
||||
|
||||
********************************************************************
|
||||
Supported Primitives Guide
|
||||
********************************************************************
|
||||
|
||||
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK.
|
||||
|
||||
------------
|
||||
Softmax
|
||||
------------
|
||||
|
||||
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` you can decompose the
|
||||
softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
|
||||
|
||||
.. math::
|
||||
:nowrap:
|
||||
|
||||
\begin{align}
|
||||
m(x) & = m( [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ] ) = \max( m(x^{(1)}),\ldots, m(x^{(T)}) ) \\
|
||||
f(x) & = [\exp( m(x^{(1)}) - m(x) ) f( x^{(1)} )\ | \ \ldots \ | \ \exp( m(x^{(T)}) - m(x) ) f( x^{(T)} )] \\
|
||||
z(x) & = \exp( m(x^{(1)}) - m(x) )\ z(x^{(1)}) + \ldots + \exp( m(x^{(T)}) - m(x) )\ z(x^{(T)}) \\
|
||||
\operatorname{softmax}(x) &= f(x)\ / \ z(x)
|
||||
\end{align}
|
||||
|
||||
where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and
|
||||
:math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar.
|
||||
|
||||
For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size
|
||||
:math:`B_r \times B_c` you can compute the row-wise softmax as follows.
|
||||
|
||||
For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate,
|
||||
|
||||
.. math::
|
||||
:nowrap:
|
||||
|
||||
\begin{align}
|
||||
\tilde{m}_{ij} &= \operatorname{rowmax}( X_{ij} ) \\
|
||||
\tilde{P}_{ij} &= \exp(X_{ij} - \tilde{m}_{ij} ) \\
|
||||
\tilde{z}_{ij} &= \operatorname{rowsum}( \tilde{P}_{ij} ) \\
|
||||
\end{align}
|
||||
|
||||
If :math:`j=1`, initialize running max, running sum, and the first column block of the output,
|
||||
|
||||
.. math::
|
||||
:nowrap:
|
||||
|
||||
\begin{align}
|
||||
m_i &= \tilde{m}_{i1} \\
|
||||
z_i &= \tilde{z}_{i1} \\
|
||||
\tilde{Y}_{i1} &= \operatorname{diag}(\tilde{z}_{i1})^{-1} \tilde{P}_{i1}
|
||||
\end{align}
|
||||
|
||||
Else if :math:`j>1`,
|
||||
|
||||
1. Update running max, running sum and column blocks :math:`k=1` to :math:`k=j-1`
|
||||
|
||||
.. math::
|
||||
:nowrap:
|
||||
|
||||
\begin{align}
|
||||
m^{new}_i &= \max(m_i, \tilde{m}_{ij} ) \\
|
||||
z^{new}_i &= \exp(m_i - m^{new}_i)\ z_i + \exp( \tilde{m}_{ij} - m^{new}_i )\ \tilde{z}_{ij} \\
|
||||
Y_{ik} &= \operatorname{diag}(z^{new}_{i})^{-1} \operatorname{diag}(z_{i}) \exp(m_i - m^{new}_i)\ Y_{ik}
|
||||
\end{align}
|
||||
|
||||
2. Initialize column block :math:`j` of output and reset running max and running sum variables:
|
||||
|
||||
.. math::
|
||||
:nowrap:
|
||||
|
||||
\begin{align}
|
||||
\tilde{Y}_{ij} &= \operatorname{diag}(z^{new}_{i})^{-1} \exp(\tilde{m}_{ij} - m^{new}_i ) \tilde{P}_{ij} \\
|
||||
z_i &= z^{new}_i \\
|
||||
m_i &= m^{new}_i \\
|
||||
\end{align}
|
||||
@@ -1,101 +0,0 @@
|
||||
.. meta::
|
||||
:description: Composable Kernel documentation and API reference library
|
||||
:keywords: composable kernel, CK, ROCm, API, documentation
|
||||
|
||||
.. _docker-hub:
|
||||
|
||||
********************************************************************
|
||||
CK Docker Hub
|
||||
********************************************************************
|
||||
|
||||
Why do I need this?
|
||||
===================
|
||||
|
||||
To make things simpler, and bring Composable Kernel and its dependencies together,
|
||||
docker images can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel/tags>`_. Docker images provide a complete image of the OS, the Composable Kernel library, and its dependencies in a single downloadable file.
|
||||
|
||||
Refer to `Docker Overview <https://docs.docker.com/get-started/overview/>`_ for more information on Docker images and containers.
|
||||
|
||||
Which image is right for me?
|
||||
============================
|
||||
|
||||
The image naming includes information related to the docker image.
|
||||
For example ``ck_ub20.04_rocm6.0`` indicates the following:
|
||||
|
||||
* ``ck`` - made for running Composable Kernel;
|
||||
* ``ub20.04`` - based on Ubuntu 20.04;
|
||||
* ``rocm6.0`` - ROCm platform version 6.0.
|
||||
|
||||
Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point. Use the ``docker pull`` command to download the file::
|
||||
|
||||
docker pull rocm/composable_kernel:ck_ub20.04_rocm6.0
|
||||
|
||||
|
||||
What is inside the image?
|
||||
-------------------------
|
||||
|
||||
The docker images have everything you need for running CK including:
|
||||
|
||||
* `ROCm <https://www.amd.com/en/graphics/servers-solutions-rocm>`_
|
||||
* `CMake <https://cmake.org/getting-started/>`_
|
||||
* `Compiler <https://github.com/RadeonOpenCompute/llvm-project>`_
|
||||
* `Composable Kernel library <https://github.com/ROCm/composable_kernel>`_
|
||||
|
||||
Running the docker container
|
||||
============================
|
||||
|
||||
After downloading the docker image, you can start the container using one of a number of commands. Start with the ``docker run`` command as shown below::
|
||||
|
||||
docker run \
|
||||
-it \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/composable_kernel:ck_ub20.04_rocm6.0 \
|
||||
/bin/bash
|
||||
|
||||
After starting the bash shell, the docker container current folder is `~/workspace`. The library path is ``~/workspace/composable_kernel``. Navigate to the library to begin the tutorial as explained in :ref:`hello-world`:
|
||||
|
||||
.. note::
|
||||
|
||||
If your local workspace is in a different location, adjust the line ``-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace`` in the ``docker run`` command to fit your folder structure.
|
||||
|
||||
Stop and restart the docker image
|
||||
=================================
|
||||
|
||||
After finishing the tutorial, or just when you have completed your work session, you can close the docker container, or stop the docker container to restart it at another time. Closing the docker container means that it is still in the active state, and can be resumed from where you left it. Stopping the container closes it, and returns the image to its initial state.
|
||||
|
||||
Use the ``Ctrl-D`` option to exit the container, while leaving it active, so you can return to the container in its current state to resume the tutorial, or pickup your project where you left off.
|
||||
|
||||
To restart the active container use the ``docker exec`` command to specify the container name and options as follows::
|
||||
|
||||
docker exec -it <container_name> bash
|
||||
|
||||
Where:
|
||||
|
||||
* `exec` is the docker command
|
||||
* `-it` is the interactive option for `exec`
|
||||
* `<container_name>` specifies an active container on the system
|
||||
* `bash` specifies the command to run in the interactive shell
|
||||
|
||||
.. note::
|
||||
|
||||
You can use the ``docker container ls`` command to list the active containers on the system.
|
||||
|
||||
To start a container from the image, use the ``docker start`` command::
|
||||
|
||||
docker start <container_name>
|
||||
|
||||
Then use the docker exec command as shown above to start the bash shell.
|
||||
|
||||
Use the ``docker stop`` command to stop the container and restore the image to its initial state::
|
||||
|
||||
docker stop <container_name>
|
||||
|
||||
Editing the docker image
|
||||
========================
|
||||
|
||||
If you want to customize the docker image, edit the
|
||||
`Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_
|
||||
from the GitHub repository to suit your needs.
|
||||
@@ -1,2 +0,0 @@
|
||||
```{include} ../LICENSE.md
|
||||
```
|
||||
@@ -1,165 +0,0 @@
|
||||
.. meta::
|
||||
:description: Composable Kernel documentation and API reference library
|
||||
:keywords: composable kernel, CK, ROCm, API, documentation
|
||||
|
||||
.. _hello-world:
|
||||
|
||||
********************************************************************
|
||||
Hello World Tutorial
|
||||
********************************************************************
|
||||
|
||||
This tutorial is for engineers dealing with artificial intelligence and machine learning who
|
||||
would like to optimize pipelines and improve performance using the Composable
|
||||
Kernel (CK) library. This tutorial provides an introduction to the CK library. You will build the library and run some examples using a "Hello World" example.
|
||||
|
||||
Description
|
||||
===========
|
||||
|
||||
Modern AI technology solves more and more problems in a variety of fields, but crafting fast and
|
||||
efficient workflows is still challenging. CK can make the AI workflow fast
|
||||
and efficient. CK is a collection of optimized AI operator kernels with tools to create
|
||||
new kernels. The library has components required for modern neural network architectures
|
||||
including matrix multiplication, convolution, contraction, reduction, attention modules, a variety of activation functions, and fused operators.
|
||||
|
||||
CK library acceleration features are based on:
|
||||
|
||||
* Layered structure
|
||||
* Tile-based computation model
|
||||
* Tensor coordinate transformation
|
||||
* Hardware acceleration use
|
||||
* Support of low precision data types including fp16, bf16, int8 and int4
|
||||
|
||||
If you need more technical details and benchmarking results read the following
|
||||
`blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_.
|
||||
|
||||
To download the library visit the `composable_kernel repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_.
|
||||
|
||||
Hardware targets
|
||||
================
|
||||
|
||||
CK library fully supports `gfx908` and `gfx90a` GPU architectures, while only some operators are
|
||||
supported for `gfx1030` devices. Check your hardware to determine the target GPU architecture.
|
||||
|
||||
========== =========
|
||||
GPU Target AMD GPU
|
||||
========== =========
|
||||
gfx908 Radeon Instinct MI100
|
||||
gfx90a Radeon Instinct MI210, MI250, MI250X
|
||||
gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT
|
||||
========== =========
|
||||
|
||||
There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if
|
||||
you don't have an AMD GPU at hand.
|
||||
|
||||
Build the library
|
||||
=================
|
||||
|
||||
This tutorial is based on the use of docker images as explained in :ref:`docker-hub`. Download a docker image suitable for your OS and ROCm release, run or start the docker container, and then resume the tutorial from this point.
|
||||
|
||||
.. note::
|
||||
|
||||
You can also `install ROCm <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/>`_ on your system, clone the `Composable Kernel repository <https://github.com/ROCmSoftwarePlatform/composable_kernel.git>`_ on GitHub, and use that to build and run the examples using the commands described below.
|
||||
|
||||
Both the docker container and GitHub repository include the Composable Kernel library. Navigate to the library::
|
||||
|
||||
cd composable_kernel/
|
||||
|
||||
Create and change to a ``build`` directory::
|
||||
|
||||
mkdir build && cd build
|
||||
|
||||
The previous section discussed supported GPU architecture. Once you decide which hardware targets are needed, run CMake using the ``GPU_TARGETS`` flag::
|
||||
|
||||
cmake \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_CXX_FLAGS="-O3" \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D GPU_TARGETS="gfx908;gfx90a;gfx1030" ..
|
||||
|
||||
If everything goes well the CMake command will return::
|
||||
|
||||
-- Configuring done
|
||||
-- Generating done
|
||||
-- Build files have been written to: "/root/workspace/composable_kernel/build"
|
||||
|
||||
Finally, you can build examples and tests::
|
||||
|
||||
make -j examples tests
|
||||
|
||||
When complete you should see::
|
||||
|
||||
Scanning dependencies of target tests
|
||||
[100%] Built target tests
|
||||
|
||||
Run examples and tests
|
||||
======================
|
||||
|
||||
Examples are listed as test cases as well, so you can run all examples and tests with::
|
||||
|
||||
ctest
|
||||
|
||||
You can check the list of all tests by running::
|
||||
|
||||
ctest -N
|
||||
|
||||
You can also run examples separately as shown in the following example execution::
|
||||
|
||||
./bin/example_gemm_xdl_fp16 1 1 1
|
||||
|
||||
The arguments ``1 1 1`` mean that you want to run this example in the mode: verify results with CPU, initialize matrices with integers, and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change.
|
||||
|
||||
If you have a device based on `gfx908` or `gfx90a` architecture, and if the example runs as expected, you should see something like::
|
||||
|
||||
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
|
||||
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
|
||||
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
|
||||
Perf: 1.08153 ms, 119.136 TFlops, 89.1972 GB/s, DeviceGemm_Xdl_CShuffle<Default, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, 8, 4, 1, 2> LoopScheduler: Interwave, PipelineVersion: v1
|
||||
|
||||
However, running it on a `gfx1030` device should result in the following::
|
||||
|
||||
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
|
||||
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
|
||||
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
|
||||
DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem
|
||||
|
||||
Don't worry, some operators are supported on `gfx1030` architecture, so you can run a
|
||||
separate example like::
|
||||
|
||||
./bin/example_gemm_dl_fp16 1 1 1
|
||||
|
||||
and it should return something like::
|
||||
|
||||
a_m_k: dim 2, lengths {3840, 4096}, strides {1, 4096}
|
||||
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
|
||||
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
|
||||
arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2}
|
||||
arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
|
||||
arg.c_grid_desc_m_n_{ 3840, 4096}
|
||||
launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1>
|
||||
|
||||
.. note::
|
||||
|
||||
A new CMake flag ``DL_KERNELS`` has been added to the latest versions of CK. If you do not see the above results when running ``example_gemm_dl_fp16``, you might need to add ``-D DL_KERNELS=ON`` to your CMake command to build the operators supported on the `gfx1030` architecture.
|
||||
|
||||
You can also run a separate test::
|
||||
|
||||
ctest -R test_gemm_fp16
|
||||
|
||||
If everything goes well you should see something like::
|
||||
|
||||
Start 121: test_gemm_fp16
|
||||
1/1 Test #121: test_gemm_fp16 ................... Passed 51.81 sec
|
||||
|
||||
100% tests passed, 0 tests failed out of 1
|
||||
|
||||
Summary
|
||||
=======
|
||||
|
||||
In this tutorial you took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. In the next tutorial you will run kernels with different configurations to find out the best one for your hardware and task.
|
||||
|
||||
P.S.: If you are running on a cloud instance, don't forget to switch off the cloud instance.
|
||||
@@ -1,41 +0,0 @@
|
||||
.. meta::
|
||||
:description: Composable Kernel documentation and API reference library
|
||||
:keywords: composable kernel, CK, ROCm, API, documentation
|
||||
|
||||
.. _what-is-ck:
|
||||
|
||||
********************************************************************
|
||||
What is the Composable Kernel library
|
||||
********************************************************************
|
||||
|
||||
|
||||
Methodology
|
||||
===========
|
||||
|
||||
The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++.
|
||||
|
||||
CK utilizes two concepts to achieve performance portability and code maintainability:
|
||||
|
||||
* A tile-based programming model
|
||||
* Algorithm complexity reduction for complex ML operators using an innovative technique called
|
||||
"Tensor Coordinate Transformation".
|
||||
|
||||
.. image:: data/ck_component.png
|
||||
:alt: CK Components
|
||||
|
||||
|
||||
Code Structure
|
||||
==============
|
||||
|
||||
The CK library is structured into 4 layers:
|
||||
|
||||
* "Templated Tile Operators" layer
|
||||
* "Templated Kernel and Invoker" layer
|
||||
* "Instantiated Kernel and Invoker" layer
|
||||
* "Client API" layer
|
||||
|
||||
It also includes a simple wrapper component used to perform tensor transform operations more easily and with fewer lines of code.
|
||||
|
||||
.. image:: data/ck_layer.png
|
||||
:alt: CK Layers
|
||||
|
||||
@@ -1,92 +0,0 @@
|
||||
.. meta::
|
||||
:description: Composable Kernel documentation and API reference library
|
||||
:keywords: composable kernel, CK, ROCm, API, documentation
|
||||
|
||||
.. _wrapper:
|
||||
|
||||
********************************************************************
|
||||
Wrapper
|
||||
********************************************************************
|
||||
|
||||
-------------------------------------
|
||||
Description
|
||||
-------------------------------------
|
||||
|
||||
.. note::
|
||||
|
||||
The wrapper is under development and its functionality is limited.
|
||||
|
||||
|
||||
The CK library provides a lightweight wrapper for more complex operations implemented in
|
||||
the library.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4));
|
||||
const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
|
||||
const auto layout = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
|
||||
|
||||
std::array<ck::index_t, 32> data;
|
||||
auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
|
||||
|
||||
for(ck::index_t w = 0; w < size(tensor); w++) {
|
||||
tensor(w) = w;
|
||||
}
|
||||
|
||||
// slice() == slice(0, -1) (whole dimension)
|
||||
auto tensor_slice = tensor(ck::wrapper::slice(1, 3), ck::make_tuple(ck::wrapper::slice(), ck::wrapper::slice()));
|
||||
std::cout << "dims:2,(2,4) strides:2,(1,8)" << std::endl;
|
||||
for(ck::index_t h = 0; h < ck::wrapper::size<0>(tensor_slice); h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<1>(tensor_slice); w++)
|
||||
{
|
||||
std::cout << tensor_slice(h, w) << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
Output::
|
||||
|
||||
dims:2,(2,4) strides:2,(1,8)
|
||||
1 5 9 13 17 21 25 29
|
||||
2 6 10 14 18 22 26 30
|
||||
|
||||
|
||||
Advanced examples:
|
||||
|
||||
* `Image to column <https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_img2col.cpp>`_
|
||||
|
||||
-------------------------------------
|
||||
Layout
|
||||
-------------------------------------
|
||||
|
||||
.. doxygenstruct:: ck::wrapper::Layout
|
||||
|
||||
-------------------------------------
|
||||
Layout helpers
|
||||
-------------------------------------
|
||||
|
||||
.. doxygenfile:: layout_utils.hpp
|
||||
|
||||
-------------------------------------
|
||||
Tensor
|
||||
-------------------------------------
|
||||
|
||||
.. doxygenstruct:: ck::wrapper::Tensor
|
||||
|
||||
-------------------------------------
|
||||
Tensor helpers
|
||||
-------------------------------------
|
||||
|
||||
.. doxygenfile:: tensor_utils.hpp
|
||||
|
||||
.. doxygenfile:: tensor_partition.hpp
|
||||
|
||||
-------------------------------------
|
||||
Operations
|
||||
-------------------------------------
|
||||
|
||||
.. doxygenfile:: copy.hpp
|
||||
.. doxygenfile:: gemm.hpp
|
||||
17
example/62_convnd_activ/CMakeLists.txt
Normal file
17
example/62_convnd_activ/CMakeLists.txt
Normal file
@@ -0,0 +1,17 @@
|
||||
add_subdirectory(binary)
|
||||
add_subdirectory(multi_AB)
|
||||
add_subdirectory(unary)
|
||||
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
add_custom_target(example_convnd_activ_xdl)
|
||||
# ScaleAdd ScaleAdd Relu
|
||||
add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)
|
||||
add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16)
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
13
example/62_convnd_activ/binary/CMakeLists.txt
Normal file
13
example/62_convnd_activ/binary/CMakeLists.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
add_custom_target(example_convnd_activ_binary_xdl)
|
||||
# Bilinear residual
|
||||
add_example_executable(example_convnd_fwd_xdl_bilinear_residual_fp16 convnd_fwd_xdl_bilinear_residual_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_fwd_xdl_bilinear_residual_fp16)
|
||||
add_example_executable(example_convnd_bwd_data_xdl_bilinear_residual_fp16 convnd_bwd_data_xdl_bilinear_residual_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_binary_xdl example_convnd_bwd_data_xdl_bilinear_residual_fp16)
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
@@ -0,0 +1,266 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
constexpr ck::index_t NDimSpatial = 3;
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNDHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNDHWK;
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
|
||||
|
||||
template <typename OutElementOp>
|
||||
using DeviceGroupedConvNDBwdDataInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<
|
||||
NDimSpatial,
|
||||
OutLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<InLayout>,
|
||||
InLayout,
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<InDataType>,
|
||||
InDataType,
|
||||
OutElementOp,
|
||||
WeiElementOp,
|
||||
InElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
true,
|
||||
true,
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
2, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<0, 2, 1>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<0, 2, 1>, // BBlockTransferSrcAccessOrder
|
||||
1, // BBlockTransferSrcVectorDim
|
||||
4, // BBlockTransferSrcScalarPerVector
|
||||
2, // BBlockTransferDstScalarPerVector_BK1
|
||||
0, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8>;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDBwdDataInstance<OutElementOp>;
|
||||
|
||||
namespace {
|
||||
// Use custom implementation to pass two more tensors for post op
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNDInstance>
|
||||
bool run_grouped_conv(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
const HostTensorDescriptor& in_g_n_c_wis_desc,
|
||||
const HostTensorDescriptor& wei_g_k_c_xs_desc,
|
||||
const HostTensorDescriptor& out_g_n_k_wos_desc,
|
||||
const InElementOp& in_element_op,
|
||||
const WeiElementOp& wei_element_op,
|
||||
const OutElementOp& out_element_op)
|
||||
{
|
||||
constexpr ck::index_t NumDs = 1;
|
||||
Tensor<OutDataType> out(out_g_n_k_wos_desc);
|
||||
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<InDataType> in_host(in_g_n_c_wis_desc);
|
||||
|
||||
std::cout << "out: " << out.mDesc << std::endl;
|
||||
std::cout << "wei: " << wei.mDesc << std::endl;
|
||||
std::cout << "in: " << in_host.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
|
||||
in_host.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
|
||||
break;
|
||||
default:
|
||||
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
|
||||
in_host.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
|
||||
}
|
||||
|
||||
// Initialize based on out_host
|
||||
Tensor<InDataType> in_device(in_host);
|
||||
|
||||
DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
out_device_buf.ToDevice(out.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
in_device_buf.ToDevice(in_device.mData.data());
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
|
||||
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), a_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), a_g_n_k_wos_strides);
|
||||
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), e_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), e_g_n_c_wis_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
// Use output as D
|
||||
const std::array<const void*, NumDs> ds = {in_device_buf.GetDeviceBuffer()};
|
||||
|
||||
auto conv = DeviceConvNDInstance{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument = conv.MakeArgument(
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
ds,
|
||||
in_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_k_wos_lengths,
|
||||
a_g_n_k_wos_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{e_g_n_c_wis_lengths},
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{e_g_n_c_wis_strides},
|
||||
e_g_n_c_wis_lengths,
|
||||
e_g_n_c_wis_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
out_element_op,
|
||||
wei_element_op,
|
||||
in_element_op);
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error("The device op with the specified compilation parameters does "
|
||||
"not support this convolution problem.");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop =
|
||||
conv_param.GetFlops() + 3 * conv_param.GetInputByte<InDataType>() / sizeof(InDataType);
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
|
||||
conv_param.GetOutputByte<InDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
std::array<Tensor<OutDataType>, NumDs> d_tensors = {in_host};
|
||||
auto ref_conv =
|
||||
ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
0, /*Num A Elementwise Tensors*/
|
||||
0, /*Num B Elementwise Tensors*/
|
||||
NumDs>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
|
||||
auto ref_argument = ref_conv.MakeArgument(in_host,
|
||||
wei,
|
||||
out,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
{},
|
||||
{},
|
||||
d_tensors);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
in_device_buf.FromDevice(in_device.mData.data());
|
||||
|
||||
return ck::utils::check_err(in_device.mData, in_host.mData);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,266 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
constexpr ck::index_t NDimSpatial = 3;
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNDHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNDHWK;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Bilinear;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <typename OutElementOp>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<OutLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<OutDataType>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
8, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
8, // BBlockTransferSrcScalarPerVector
|
||||
8, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8>;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
|
||||
namespace {
|
||||
// Use custom implementation to pass two more tensors for post op
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNDFwdInstance>
|
||||
bool run_grouped_conv(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
const HostTensorDescriptor& in_g_n_c_wis_desc,
|
||||
const HostTensorDescriptor& wei_g_k_c_xs_desc,
|
||||
const HostTensorDescriptor& out_g_n_k_wos_desc,
|
||||
const InElementOp& in_element_op,
|
||||
const WeiElementOp& wei_element_op,
|
||||
const OutElementOp& out_element_op)
|
||||
{
|
||||
constexpr ck::index_t NumDs = 1;
|
||||
Tensor<InDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
|
||||
|
||||
std::cout << "in: " << in.mDesc << std::endl;
|
||||
std::cout << "wei: " << wei.mDesc << std::endl;
|
||||
std::cout << "out: " << out_host.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
|
||||
out_host.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
|
||||
out_host.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
|
||||
}
|
||||
|
||||
// Initialize based on out_host
|
||||
Tensor<OutDataType> out_device(out_host);
|
||||
|
||||
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
out_device_buf.ToDevice(out_device.mData.data());
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
|
||||
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
|
||||
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
// Use output as D
|
||||
const std::array<const void*, NumDs> ds = {out_device_buf.GetDeviceBuffer()};
|
||||
|
||||
auto conv = DeviceConvNDFwdInstance{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument = conv.MakeArgument(
|
||||
in_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
ds,
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{e_g_n_k_wos_lengths},
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{e_g_n_k_wos_strides},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error("The device op with the specified compilation parameters does "
|
||||
"not support this convolution problem.");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop =
|
||||
conv_param.GetFlops() + 3 * conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
|
||||
conv_param.GetOutputByte<OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
std::array<Tensor<OutDataType>, NumDs> d_tensors = {out_host};
|
||||
auto ref_conv =
|
||||
ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
0, /*Num A Elementwise Tensors*/
|
||||
0, /*Num B Elementwise Tensors*/
|
||||
NumDs>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
out_host,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
{},
|
||||
{},
|
||||
d_tensors);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,294 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
constexpr ck::index_t NDimSpatial = 3;
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <typename OutElementOp>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<OutLayout, BiasLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<OutDataType, OutDataType>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
8, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
8, // BBlockTransferSrcScalarPerVector
|
||||
8, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8>;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
|
||||
namespace {
|
||||
// Use custom implementation to pass two more tensors for post op
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNDFwdInstance>
|
||||
bool run_grouped_conv(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
const HostTensorDescriptor& in_g_n_c_wis_desc,
|
||||
const HostTensorDescriptor& wei_g_k_c_xs_desc,
|
||||
const HostTensorDescriptor& out_g_n_k_wos_desc,
|
||||
const InElementOp& in_element_op,
|
||||
const WeiElementOp& wei_element_op,
|
||||
const OutElementOp& out_element_op)
|
||||
{
|
||||
constexpr ck::index_t NumDs = 2;
|
||||
const ck::index_t G = out_g_n_k_wos_desc.GetLengths()[0];
|
||||
const ck::index_t K = out_g_n_k_wos_desc.GetLengths()[2];
|
||||
|
||||
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
|
||||
std::array<ck::index_t, NDimSpatial + 3> bias_g_k_lengths;
|
||||
std::array<ck::index_t, NDimSpatial + 3> bias_g_k_strides;
|
||||
// Fill other lenghts than G,K with 1 and strides with 0
|
||||
bias_g_k_lengths.fill(1);
|
||||
bias_g_k_strides.fill(0);
|
||||
bias_g_k_lengths[0] = G;
|
||||
bias_g_k_lengths[2] = K;
|
||||
bias_g_k_strides[0] = K; // stride to G
|
||||
bias_g_k_strides[2] = 1; // stride to K
|
||||
const auto broadcasted_bias_desc = HostTensorDescriptor(bias_g_k_lengths, bias_g_k_strides);
|
||||
|
||||
// y = relu ( alpha1 * conv(x) + alpha2 * z + bias )
|
||||
Tensor<InDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
|
||||
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
|
||||
std::array<Tensor<OutDataType>, NumDs> d_tensors = {Tensor<OutDataType>(out_g_n_k_wos_desc),
|
||||
Tensor<OutDataType>(broadcasted_bias_desc)};
|
||||
|
||||
std::cout << "in: " << in.mDesc << std::endl;
|
||||
std::cout << "wei: " << wei.mDesc << std::endl;
|
||||
std::cout << "out: " << out_host.mDesc << std::endl;
|
||||
std::cout << "z_tensor: " << d_tensors[0].mDesc << std::endl;
|
||||
std::cout << "bias_tensor: " << d_tensors[1].mDesc << std::endl;
|
||||
|
||||
// Make sure that we allocated only G * K values for bias
|
||||
assert(static_cast<ck::index_t>(d_tensors[1].mData.size()) == G * K);
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
|
||||
d_tensors[0].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
|
||||
d_tensors[1].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
|
||||
d_tensors[0].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
|
||||
d_tensors[1].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
|
||||
}
|
||||
|
||||
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem z_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize());
|
||||
DeviceMem bias_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
z_buf.ToDevice(d_tensors[0].mData.data());
|
||||
bias_buf.ToDevice(d_tensors[1].mData.data());
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
|
||||
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
|
||||
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
const std::array<const void*, NumDs> ds = {z_buf.GetDeviceBuffer(), bias_buf.GetDeviceBuffer()};
|
||||
|
||||
auto conv = DeviceConvNDFwdInstance{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
ds,
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
|
||||
e_g_n_k_wos_lengths, bias_g_k_lengths},
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
|
||||
e_g_n_k_wos_strides, bias_g_k_strides},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error("The device op with the specified compilation parameters does "
|
||||
"not support this convolution problem.");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = conv_param.GetFlops() + G * K +
|
||||
conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
|
||||
G * K * sizeof(OutDataType) + conv_param.GetOutputByte<OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
auto ref_conv =
|
||||
ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
0, /*Num A Elementwise Tensors*/
|
||||
0, /*Num B Elementwise Tensors*/
|
||||
NumDs>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
out_host,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
{},
|
||||
{},
|
||||
d_tensors);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#include "run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,270 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
constexpr ck::index_t NDimSpatial = 3;
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNDHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNDHWK;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <typename OutElementOp>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<OutLayout, OutLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<OutDataType, OutDataType>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
8, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
8, // BBlockTransferSrcScalarPerVector
|
||||
8, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8>;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
|
||||
namespace {
|
||||
// Use custom implementation to pass two more tensors for post op
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNDFwdInstance>
|
||||
bool run_grouped_conv(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
const HostTensorDescriptor& in_g_n_c_wis_desc,
|
||||
const HostTensorDescriptor& wei_g_k_c_xs_desc,
|
||||
const HostTensorDescriptor& out_g_n_k_wos_desc,
|
||||
const InElementOp& in_element_op,
|
||||
const WeiElementOp& wei_element_op,
|
||||
const OutElementOp& out_element_op)
|
||||
{
|
||||
constexpr ck::index_t NumDs = 2;
|
||||
Tensor<InDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
|
||||
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
|
||||
std::array<Tensor<OutDataType>, NumDs> d_tensors = {Tensor<OutDataType>(out_g_n_k_wos_desc),
|
||||
Tensor<OutDataType>(out_g_n_k_wos_desc)};
|
||||
|
||||
std::cout << "in: " << in.mDesc << std::endl;
|
||||
std::cout << "wei: " << wei.mDesc << std::endl;
|
||||
std::cout << "out: " << out_host.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
|
||||
d_tensors[0].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
|
||||
d_tensors[1].GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
|
||||
d_tensors[0].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
|
||||
d_tensors[1].GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.05, 0.05});
|
||||
}
|
||||
|
||||
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem d0_buf(sizeof(OutDataType) * d_tensors[0].mDesc.GetElementSpaceSize());
|
||||
DeviceMem d1_buf(sizeof(OutDataType) * d_tensors[1].mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
d0_buf.ToDevice(d_tensors[0].mData.data());
|
||||
d1_buf.ToDevice(d_tensors[1].mData.data());
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
|
||||
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
|
||||
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
const std::array<const void*, NumDs> ds = {d0_buf.GetDeviceBuffer(), d1_buf.GetDeviceBuffer()};
|
||||
|
||||
auto conv = DeviceConvNDFwdInstance{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
ds,
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
|
||||
e_g_n_k_wos_lengths, e_g_n_k_wos_lengths},
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, NumDs>{
|
||||
e_g_n_k_wos_strides, e_g_n_k_wos_strides},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error("The device op with the specified compilation parameters does "
|
||||
"not support this convolution problem.");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop =
|
||||
conv_param.GetFlops() + 2 * conv_param.GetOutputByte<OutDataType>() / sizeof(OutDataType);
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
|
||||
2 * conv_param.GetOutputByte<OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
auto ref_conv =
|
||||
ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
0, /*Num A Elementwise Tensors*/
|
||||
0, /*Num B Elementwise Tensors*/
|
||||
NumDs>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
out_host,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
{},
|
||||
{},
|
||||
d_tensors);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
#include "run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
17
example/62_convnd_activ/multi_AB/CMakeLists.txt
Normal file
17
example/62_convnd_activ/multi_AB/CMakeLists.txt
Normal file
@@ -0,0 +1,17 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
add_custom_target(example_convnd_activ_multi_ab_xdl)
|
||||
# ScaleAdd on A and B
|
||||
add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp16 conv_fwd_xdl_scaleadd_ab_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_fp16)
|
||||
add_example_executable(example_conv_fwd_xdl_scaleadd_ab_fp32 conv_fwd_xdl_scaleadd_ab_fp32.cpp)
|
||||
add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_fp32)
|
||||
add_example_executable(example_conv_fwd_xdl_scaleadd_ab_bf16 conv_fwd_xdl_scaleadd_ab_bf16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_bf16)
|
||||
add_example_executable(example_conv_fwd_xdl_scaleadd_ab_int8 conv_fwd_xdl_scaleadd_ab_int8.cpp)
|
||||
add_example_dependencies(example_convnd_activ_multi_ab_xdl example_conv_fwd_xdl_scaleadd_ab_int8)
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_multi_ab_common.hpp"
|
||||
|
||||
using DataType = ck::bhalf_t;
|
||||
using AccDataType = float;
|
||||
using InDataType = DataType;
|
||||
using WeiDataType = DataType;
|
||||
using OutDataType = DataType;
|
||||
using ADataTypes = ck::Tuple<DataType, DataType>;
|
||||
using BDataTypes = ck::Tuple<DataType, DataType>;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
|
||||
AccDataType,
|
||||
ADataTypes,
|
||||
BDataTypes,
|
||||
InElementOp,
|
||||
WeiElementOp>;
|
||||
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_multi_ab_common.hpp"
|
||||
|
||||
using DataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using InDataType = DataType;
|
||||
using WeiDataType = DataType;
|
||||
using OutDataType = DataType;
|
||||
using ADataTypes = ck::Tuple<DataType, DataType>;
|
||||
using BDataTypes = ck::Tuple<DataType, DataType>;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
|
||||
AccDataType,
|
||||
ADataTypes,
|
||||
BDataTypes,
|
||||
InElementOp,
|
||||
WeiElementOp>;
|
||||
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_multi_ab_common.hpp"
|
||||
|
||||
using DataType = float;
|
||||
using AccDataType = float;
|
||||
using InDataType = DataType;
|
||||
using WeiDataType = DataType;
|
||||
using OutDataType = DataType;
|
||||
using ADataTypes = ck::Tuple<DataType, DataType>;
|
||||
using BDataTypes = ck::Tuple<DataType, DataType>;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
|
||||
AccDataType,
|
||||
ADataTypes,
|
||||
BDataTypes,
|
||||
InElementOp,
|
||||
WeiElementOp>;
|
||||
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_multi_ab_common.hpp"
|
||||
|
||||
using DataType = int8_t;
|
||||
using AccDataType = int32_t;
|
||||
using InDataType = DataType;
|
||||
using WeiDataType = DataType;
|
||||
using OutDataType = DataType;
|
||||
using ADataTypes = ck::Tuple<DataType, DataType>;
|
||||
using BDataTypes = ck::Tuple<DataType, DataType>;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::ScaleAdd;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDMultiABFwdInstance<DataType,
|
||||
AccDataType,
|
||||
ADataTypes,
|
||||
BDataTypes,
|
||||
InElementOp,
|
||||
WeiElementOp>;
|
||||
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,266 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
constexpr ck::index_t NDimSpatial = 3;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNDHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNDHWK;
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <typename DataType,
|
||||
typename AccDataType,
|
||||
typename InDataTypes,
|
||||
typename WeiDataTypes,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp>
|
||||
using DeviceGroupedConvNDMultiABFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataTypes,
|
||||
WeiDataTypes,
|
||||
AccDataType,
|
||||
DataType,
|
||||
ck::Tuple<>,
|
||||
DataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
8, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
8, // BBlockTransferSrcScalarPerVector
|
||||
8, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8>;
|
||||
|
||||
namespace {
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNDFwdInstance>
|
||||
bool run_grouped_conv(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
const HostTensorDescriptor& in_g_n_c_wis_desc,
|
||||
const HostTensorDescriptor& wei_g_k_c_xs_desc,
|
||||
const HostTensorDescriptor& out_g_n_k_wos_desc,
|
||||
const InElementOp& in_element_op,
|
||||
const WeiElementOp& wei_element_op,
|
||||
const OutElementOp& out_element_op)
|
||||
{
|
||||
constexpr ck::index_t NumAs = 2;
|
||||
constexpr ck::index_t NumBs = 2;
|
||||
Tensor<InDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<InDataType> in_bias(in_g_n_c_wis_desc);
|
||||
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<WeiDataType> wei_bias(wei_g_k_c_xs_desc);
|
||||
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
|
||||
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
|
||||
|
||||
std::cout << "in: " << in.mDesc << std::endl;
|
||||
std::cout << "wei: " << wei.mDesc << std::endl;
|
||||
std::cout << "out: " << out_host.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
|
||||
in_bias.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
|
||||
wei_bias.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
|
||||
in_bias.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
|
||||
wei_bias.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1.0, 1.0});
|
||||
}
|
||||
|
||||
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem in_bias_device_buf(sizeof(InDataType) * in_bias.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_bias_device_buf(sizeof(WeiDataType) * wei_bias.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
in_bias_device_buf.ToDevice(in_bias.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
wei_bias_device_buf.ToDevice(wei_bias.mData.data());
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
|
||||
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
|
||||
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
std::array<const void*, NumAs> as{in_device_buf.GetDeviceBuffer(),
|
||||
in_bias_device_buf.GetDeviceBuffer()};
|
||||
std::array<const void*, NumBs> bs{wei_device_buf.GetDeviceBuffer(),
|
||||
wei_bias_device_buf.GetDeviceBuffer()};
|
||||
std::array<const void*, 0> ds{};
|
||||
|
||||
// do Conv
|
||||
auto conv = DeviceConvNDFwdInstance{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument = conv.MakeArgument(as,
|
||||
bs,
|
||||
ds,
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
{},
|
||||
{},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"wrong! device_conv with the specified compilation parameters does "
|
||||
"not support this Conv problem");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = conv_param.GetFlops() +
|
||||
2 * conv_param.GetOutputByte<InDataType>() / sizeof(InDataType) +
|
||||
2 * conv_param.GetOutputByte<WeiDataType>() / sizeof(WeiDataType);
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>() +
|
||||
conv_param.GetInputByte<InDataType>() +
|
||||
conv_param.GetWeightByte<WeiDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
const std::array<Tensor<InDataType>, NumAs - 1> elementwise_a_tensors = {in_bias};
|
||||
const std::array<Tensor<WeiDataType>, NumBs - 1> elementwise_b_tensors = {wei_bias};
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
NumAs - 1,
|
||||
NumBs - 1>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
out_host,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op,
|
||||
elementwise_a_tensors,
|
||||
elementwise_b_tensors);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
91
example/62_convnd_activ/run_convnd_activ_example.inc
Normal file
91
example/62_convnd_activ/run_convnd_activ_example.inc
Normal file
@@ -0,0 +1,91 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
void print_helper_msg()
|
||||
{
|
||||
std::cout << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
|
||||
<< "arg3: time kernel (0=no, 1=yes)\n"
|
||||
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
|
||||
}
|
||||
|
||||
bool run_convnd_example(int argc, char* argv[])
|
||||
{
|
||||
print_helper_msg();
|
||||
|
||||
bool do_verification = true;
|
||||
// Use floats for SoftRelu by default to avoid overflow after e^x.
|
||||
int init_method =
|
||||
std::is_same_v<OutElementOp, ck::tensor_operation::element_wise::SoftRelu> ? 2 : 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// Following shapes are selected to avoid overflow. Expect inf in case of
|
||||
// size increase for some elementwise ops.
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
|
||||
|
||||
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
|
||||
}
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{};
|
||||
|
||||
const auto run = [&]() {
|
||||
const auto in_g_n_c_wis_desc =
|
||||
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
|
||||
conv_param);
|
||||
|
||||
const auto wei_g_k_c_xs_desc =
|
||||
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
|
||||
conv_param);
|
||||
|
||||
const auto out_g_n_k_wos_desc =
|
||||
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
|
||||
conv_param);
|
||||
|
||||
return run_grouped_conv<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
DeviceGroupedConvNDActivInstance>(do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
};
|
||||
|
||||
if(conv_param.num_dim_spatial_ == 3)
|
||||
{
|
||||
return run();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
35
example/62_convnd_activ/unary/CMakeLists.txt
Normal file
35
example/62_convnd_activ/unary/CMakeLists.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)
|
||||
set(target 0)
|
||||
foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
add_custom_target(example_convnd_activ_unary_xdl)
|
||||
# Sigmoid
|
||||
add_example_executable(example_convnd_fwd_xdl_sigmoid_fp16 convnd_fwd_xdl_sigmoid_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_sigmoid_fp16)
|
||||
# Tanh
|
||||
add_example_executable(example_convnd_fwd_xdl_tanh_fp16 convnd_fwd_xdl_tanh_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_tanh_fp16)
|
||||
# Relu
|
||||
add_example_executable(example_convnd_fwd_xdl_relu_fp16 convnd_fwd_xdl_relu_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_relu_fp16)
|
||||
# SoftRelu
|
||||
add_example_executable(example_convnd_fwd_xdl_softrelu_fp16 convnd_fwd_xdl_softrelu_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_softrelu_fp16)
|
||||
# Abs
|
||||
add_example_executable(example_convnd_fwd_xdl_abs_fp16 convnd_fwd_xdl_abs_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_abs_fp16)
|
||||
# Pow
|
||||
add_example_executable(example_convnd_fwd_xdl_pow_fp16 convnd_fwd_xdl_pow_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_pow_fp16)
|
||||
# Clipped Relu
|
||||
add_example_executable(example_convnd_fwd_xdl_clippedrelu_fp16 convnd_fwd_xdl_clippedrelu_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_clippedrelu_fp16)
|
||||
# Leaky Relu
|
||||
add_example_executable(example_convnd_fwd_xdl_leakyrelu_fp16 convnd_fwd_xdl_leakyrelu_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_leakyrelu_fp16)
|
||||
# Elu
|
||||
add_example_executable(example_convnd_fwd_xdl_elu_fp16 convnd_fwd_xdl_elu_fp16.cpp)
|
||||
add_example_dependencies(example_convnd_activ_unary_xdl example_convnd_fwd_xdl_elu_fp16)
|
||||
set(target 1)
|
||||
endif()
|
||||
endforeach()
|
||||
237
example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp
Normal file
237
example/62_convnd_activ/unary/convnd_fwd_activ_unary_common.hpp
Normal file
@@ -0,0 +1,237 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
constexpr ck::index_t NDimSpatial = 3;
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNDHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNDHWK;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <typename OutElementOp>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
8, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
8, // BBlockTransferSrcScalarPerVector
|
||||
8, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8>;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNDFwdInstance>
|
||||
bool run_grouped_conv(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
const HostTensorDescriptor& in_g_n_c_wis_desc,
|
||||
const HostTensorDescriptor& wei_g_k_c_xs_desc,
|
||||
const HostTensorDescriptor& out_g_n_k_wos_desc,
|
||||
const InElementOp& in_element_op,
|
||||
const WeiElementOp& wei_element_op,
|
||||
const OutElementOp& out_element_op)
|
||||
{
|
||||
Tensor<InDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
|
||||
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
|
||||
|
||||
std::cout << "in: " << in.mDesc << std::endl;
|
||||
std::cout << "wei: " << wei.mDesc << std::endl;
|
||||
std::cout << "out: " << out_host.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
|
||||
}
|
||||
|
||||
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
|
||||
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
|
||||
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
// do Conv
|
||||
auto conv = DeviceConvNDFwdInstance{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 0>{},
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error("The device op with the specified compilation parameters does "
|
||||
"not support this convolution problem.");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
out_host,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
11
example/62_convnd_activ/unary/convnd_fwd_xdl_abs_fp16.cpp
Normal file
11
example/62_convnd_activ/unary/convnd_fwd_xdl_abs_fp16.cpp
Normal file
@@ -0,0 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_unary_common.hpp"
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::UnaryAbs;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_unary_common.hpp"
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::ClippedRelu;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
11
example/62_convnd_activ/unary/convnd_fwd_xdl_elu_fp16.cpp
Normal file
11
example/62_convnd_activ/unary/convnd_fwd_xdl_elu_fp16.cpp
Normal file
@@ -0,0 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_unary_common.hpp"
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Elu;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_unary_common.hpp"
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::LeakyRelu;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
11
example/62_convnd_activ/unary/convnd_fwd_xdl_pow_fp16.cpp
Normal file
11
example/62_convnd_activ/unary/convnd_fwd_xdl_pow_fp16.cpp
Normal file
@@ -0,0 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_unary_common.hpp"
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Power;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
11
example/62_convnd_activ/unary/convnd_fwd_xdl_relu_fp16.cpp
Normal file
11
example/62_convnd_activ/unary/convnd_fwd_xdl_relu_fp16.cpp
Normal file
@@ -0,0 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_unary_common.hpp"
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Relu;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_unary_common.hpp"
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Sigmoid;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -0,0 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_unary_common.hpp"
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::SoftRelu;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
11
example/62_convnd_activ/unary/convnd_fwd_xdl_tanh_fp16.cpp
Normal file
11
example/62_convnd_activ/unary/convnd_fwd_xdl_tanh_fp16.cpp
Normal file
@@ -0,0 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_activ_unary_common.hpp"
|
||||
|
||||
using OutElementOp = ck::tensor_operation::element_wise::TanH;
|
||||
|
||||
using DeviceGroupedConvNDActivInstance = DeviceGroupedConvNDFwdInstance<OutElementOp>;
|
||||
#include "../run_convnd_activ_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_convnd_example(argc, argv); }
|
||||
@@ -1,212 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iomanip>
|
||||
#include <random>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
|
||||
namespace ck {
|
||||
template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
|
||||
void host_elementwise4D(HostTensorB& B_nhwc,
|
||||
const HostTensorA& A_nchw,
|
||||
FunctorA functor_a,
|
||||
FunctorB functor_b,
|
||||
float scale)
|
||||
{
|
||||
std::size_t N = A_nchw.mDesc.GetLengths()[0];
|
||||
std::size_t C = A_nchw.mDesc.GetLengths()[1];
|
||||
std::size_t H = A_nchw.mDesc.GetLengths()[2];
|
||||
std::size_t W = A_nchw.mDesc.GetLengths()[3];
|
||||
for(std::size_t w = 0; w < W; ++w)
|
||||
for(std::size_t h = 0; h < H; ++h)
|
||||
for(std::size_t c = 0; c < C; ++c)
|
||||
for(std::size_t n = 0; n < N; ++n)
|
||||
{
|
||||
using tmp_type = ck::remove_reference_t<decltype(B_nhwc(0, 0))>;
|
||||
tmp_type tmp_val = 0;
|
||||
auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
|
||||
functor_b(tmp_val, a_val);
|
||||
functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
|
||||
scale * tmp_val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ADataType, typename BDataType, index_t NumDim>
|
||||
bool test_permute_scale_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
bool time_kernel,
|
||||
std::vector<index_t> lengths)
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
float scale = 2.f;
|
||||
|
||||
index_t N = lengths[0];
|
||||
index_t C = lengths[1];
|
||||
index_t H = lengths[2];
|
||||
index_t W = lengths[3];
|
||||
|
||||
std::vector<ck::index_t> nchw = {N, C, H, W};
|
||||
std::vector<ck::index_t> nhwc = {N, H, W, C};
|
||||
Tensor<ADataType> a(nchw);
|
||||
Tensor<BDataType> b(nhwc);
|
||||
Tensor<BDataType> host_b(nhwc);
|
||||
|
||||
std::array<ck::index_t, 4> ab_lengths;
|
||||
|
||||
std::array<ck::index_t, 4> a_strides = {1,
|
||||
static_cast<int>(nchw[0]),
|
||||
static_cast<int>(nchw[0] * nchw[1]),
|
||||
static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
|
||||
|
||||
std::array<ck::index_t, 4> b_strides = {1,
|
||||
static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
|
||||
static_cast<int>(nhwc[0]),
|
||||
static_cast<int>(nhwc[0] * nhwc[1])};
|
||||
ck::ranges::copy(nchw, ab_lengths.begin());
|
||||
|
||||
std::cout << "A: " << a.mDesc << std::endl;
|
||||
std::cout << "B: " << b.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
|
||||
default: // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}
|
||||
std::mt19937 gen(11939);
|
||||
std::uniform_int_distribution<int> dis(0, 1);
|
||||
auto i = 0;
|
||||
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
|
||||
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
|
||||
for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
|
||||
for(std::size_t n = 0; n < a.mDesc.GetLengths()[0]; ++n)
|
||||
{
|
||||
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
|
||||
(h * nchw[3]) + w] = i;
|
||||
i = dis(gen);
|
||||
}
|
||||
}
|
||||
|
||||
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
|
||||
|
||||
a_device_buf.ToDevice(a.mData.data());
|
||||
|
||||
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
|
||||
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
ElementOp,
|
||||
UnaryOp,
|
||||
Scale,
|
||||
NumDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_instance_name;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
host_elementwise4D(host_b, a, ElementOp{}, UnaryOp{}, scale);
|
||||
}
|
||||
|
||||
for(auto& op_ptr : op_ptrs)
|
||||
{
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(ab_lengths,
|
||||
{a_strides},
|
||||
{b_strides},
|
||||
input,
|
||||
output,
|
||||
ElementOp{},
|
||||
UnaryOp{},
|
||||
Scale{scale});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
b_device_buf.SetZero();
|
||||
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
b_device_buf.FromDevice(b.mData.data());
|
||||
|
||||
pass &= ck::utils::check_err(
|
||||
b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
|
||||
LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
|
||||
|
||||
std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
|
||||
sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_instance_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
if(time_kernel)
|
||||
{
|
||||
LogRange(std::cout << "length = ", lengths, ",") << ", ";
|
||||
std::cout << "best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_instance_name << std::endl;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
@@ -1,21 +0,0 @@
|
||||
add_custom_target(test_wrapper)
|
||||
|
||||
add_gtest_executable(test_wrapper_layout test_wrapper_layout.cpp)
|
||||
target_link_libraries(test_wrapper_layout PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_layout)
|
||||
add_gtest_executable(test_wrapper_tensor test_wrapper_tensor.cpp)
|
||||
target_link_libraries(test_wrapper_tensor PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_tensor)
|
||||
add_gtest_executable(test_wrapper_copy test_wrapper_copy.cpp)
|
||||
target_link_libraries(test_wrapper_copy PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_copy)
|
||||
add_gtest_executable(test_wrapper_partition test_wrapper_partition.cpp)
|
||||
target_link_libraries(test_wrapper_partition PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_partition)
|
||||
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR
|
||||
GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR
|
||||
GPU_TARGETS MATCHES "gfx942")
|
||||
add_gtest_executable(test_wrapper_gemm test_wrapper_gemm.cpp)
|
||||
target_link_libraries(test_wrapper_gemm PRIVATE utility)
|
||||
add_dependencies(test_wrapper test_wrapper_gemm)
|
||||
endif()
|
||||
@@ -1,130 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
#include "ck/wrapper/operations/copy.hpp"
|
||||
|
||||
// Test copy from Global to Global through LDS and VGPR
|
||||
template <typename InputTensor,
|
||||
typename OutputTensor,
|
||||
typename BlockShape,
|
||||
typename ThreadLayoutShape,
|
||||
bool UseOptimizedCopy>
|
||||
__global__ void TestCopyDevice(const InputTensor input_tensor,
|
||||
OutputTensor output_tensor,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayoutShape thread_layout)
|
||||
{
|
||||
__shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
|
||||
const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
p_shared, ck::wrapper::make_layout(tile_shape));
|
||||
|
||||
const auto block_idx = static_cast<ck::index_t>(blockIdx.x);
|
||||
|
||||
// Get local tiles for global memory
|
||||
const auto input_local_tile = ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idx);
|
||||
const auto output_local_tile =
|
||||
ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idx);
|
||||
|
||||
// Get partition per thread
|
||||
const auto input_local_partition =
|
||||
ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
|
||||
auto lds_local_partition =
|
||||
ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
|
||||
auto output_local_partition =
|
||||
ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);
|
||||
|
||||
// Allocate VGPR
|
||||
auto tensor_vgpr =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
|
||||
layout(lds_local_partition));
|
||||
|
||||
// Perform copy
|
||||
if constexpr(UseOptimizedCopy)
|
||||
{
|
||||
using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>>;
|
||||
constexpr ck::index_t vector_dim = 0;
|
||||
constexpr ck::index_t scalar_per_vector = 2;
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
|
||||
lds_local_partition);
|
||||
// TODO: Enable optimized copy for static buffers
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_local_partition,
|
||||
tensor_vgpr);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(tensor_vgpr,
|
||||
output_local_partition);
|
||||
}
|
||||
else
|
||||
{
|
||||
ck::wrapper::copy(input_local_partition, lds_local_partition);
|
||||
ck::wrapper::copy(lds_local_partition, tensor_vgpr);
|
||||
ck::wrapper::copy(tensor_vgpr, output_local_partition);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool UseOptimizedCopy>
|
||||
void PerformCopyGlobalToGlobalViaLDS()
|
||||
{
|
||||
const auto shape =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<256>{});
|
||||
const auto strides =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{});
|
||||
const auto layout = ck::wrapper::make_layout(shape, strides);
|
||||
|
||||
// 0, 1, 2, ..., size(shape) - 1
|
||||
std::vector<ck::index_t> input_data(ck::wrapper::size(shape));
|
||||
std::iota(input_data.begin(), input_data.end(), 0);
|
||||
|
||||
// Global memory buffers
|
||||
DeviceMem in_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
|
||||
DeviceMem out_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
|
||||
|
||||
in_buf.ToDevice(input_data.data());
|
||||
out_buf.SetZero();
|
||||
|
||||
// Create tensors for global memory
|
||||
const auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const ck::index_t*>(in_buf.GetDeviceBuffer()), layout);
|
||||
auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);
|
||||
|
||||
const auto thread_layout = ck::make_tuple(ck::Number<1>{}, ck::Number<32>{});
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});
|
||||
|
||||
const ck::index_t grid_size = ck::math::integer_divide_ceil(
|
||||
ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape));
|
||||
|
||||
const auto kernel = TestCopyDevice<decltype(input_tensor_global),
|
||||
decltype(output_tensor_global),
|
||||
decltype(tile_shape),
|
||||
decltype(thread_layout),
|
||||
UseOptimizedCopy>;
|
||||
launch_and_time_kernel(StreamConfig{},
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(ck::wrapper::size(thread_layout)),
|
||||
0,
|
||||
input_tensor_global,
|
||||
output_tensor_global,
|
||||
tile_shape,
|
||||
thread_layout);
|
||||
|
||||
// Verify results
|
||||
std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
|
||||
out_buf.FromDevice(output_data.data());
|
||||
EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
|
||||
}
|
||||
|
||||
TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy) { PerformCopyGlobalToGlobalViaLDS<false>(); }
|
||||
TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy) { PerformCopyGlobalToGlobalViaLDS<true>(); }
|
||||
@@ -1,257 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/library/utility/fill.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
#include "ck/wrapper/operations/copy.hpp"
|
||||
#include "ck/wrapper/operations/gemm.hpp"
|
||||
|
||||
template <typename DataType>
|
||||
void CheckResult(const std::vector<DataType>& a_data,
|
||||
const std::vector<DataType>& b_data,
|
||||
std::vector<DataType>& c_m_n_device_result,
|
||||
const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K)
|
||||
{
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::
|
||||
ReferenceGemm<DataType, DataType, DataType, float, PassThrough, PassThrough, PassThrough>;
|
||||
|
||||
Tensor<DataType> a_m_k(HostTensorDescriptor({M, K}));
|
||||
Tensor<DataType> b_k_n(HostTensorDescriptor({K, N}, {1, K}));
|
||||
Tensor<DataType> c_m_n_host_result(HostTensorDescriptor({M, N}));
|
||||
|
||||
a_m_k.mData = a_data;
|
||||
b_k_n.mData = b_data;
|
||||
|
||||
auto ref_op = ReferenceGemmInstance{};
|
||||
auto ref_invoker = ref_op.MakeInvoker();
|
||||
auto ref_argument = ref_op.MakeArgument(
|
||||
a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
EXPECT_TRUE(ck::utils::check_err(c_m_n_device_result, c_m_n_host_result.mData));
|
||||
}
|
||||
|
||||
template <typename DataType,
|
||||
typename GemmTraits,
|
||||
ck::index_t scalar_per_vector,
|
||||
typename BlockShape,
|
||||
typename ThreadLayoutShape>
|
||||
__global__ void DeviceGemm(const void* p_a,
|
||||
const void* p_b,
|
||||
void* p_c,
|
||||
const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayoutShape thread_layout)
|
||||
{
|
||||
constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
|
||||
constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
|
||||
constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);
|
||||
|
||||
const auto a_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
|
||||
const auto b_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
|
||||
const auto c_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
|
||||
|
||||
constexpr auto a_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(MPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
|
||||
constexpr auto b_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(NPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
|
||||
constexpr auto c_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(MPerBlock, NPerBlock), ck::make_tuple(NPerBlock, ck::Number<1>{}));
|
||||
|
||||
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_a), a_global_layout);
|
||||
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_b), b_global_layout);
|
||||
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<DataType*>(p_c), c_global_layout);
|
||||
|
||||
auto a_padded_global_tensor = ck::wrapper::pad(a_global_tensor, shape(a_tile_layout));
|
||||
auto b_padded_global_tensor = ck::wrapper::pad(b_global_tensor, shape(b_tile_layout));
|
||||
auto c_padded_global_tensor = ck::wrapper::pad(c_global_tensor, shape(c_tile_layout));
|
||||
|
||||
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout)];
|
||||
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout)];
|
||||
|
||||
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_a), a_tile_layout);
|
||||
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_b), b_tile_layout);
|
||||
|
||||
const ck::index_t block_idx = static_cast<ck::index_t>(blockIdx.x);
|
||||
using DimAccessOrder = ck::Tuple<ck::Number<0>, ck::Number<1>>;
|
||||
constexpr ck::index_t vector_dim = 1;
|
||||
|
||||
auto c_global_local_tile = ck::wrapper::make_local_tile(
|
||||
c_padded_global_tensor,
|
||||
tile_shape,
|
||||
block_idx,
|
||||
make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(KPerBlock)));
|
||||
auto c_global_local_partition =
|
||||
ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>(c_global_local_tile);
|
||||
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>();
|
||||
ck::wrapper::clear(c_vgpr_reg);
|
||||
|
||||
const ck::index_t num_loop = ck::math::integer_divide_ceil(K, KPerBlock);
|
||||
ck::index_t i = 0;
|
||||
do
|
||||
{
|
||||
const auto k_slice = ck::wrapper::slice(i * KPerBlock, (i + 1) * KPerBlock);
|
||||
auto a_padded_global_tensor_k_slice = a_padded_global_tensor(ck::wrapper::slice(), k_slice);
|
||||
auto b_padded_global_tensor_k_slice = b_padded_global_tensor(ck::wrapper::slice(), k_slice);
|
||||
auto a_global_local_tile = ck::wrapper::make_local_tile(
|
||||
a_padded_global_tensor_k_slice,
|
||||
tile_shape,
|
||||
block_idx,
|
||||
make_tuple(ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}));
|
||||
auto b_global_local_tile = ck::wrapper::make_local_tile(
|
||||
b_padded_global_tensor_k_slice,
|
||||
tile_shape,
|
||||
block_idx,
|
||||
make_tuple(ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}));
|
||||
|
||||
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
a_global_local_tile, a_lds_tensor, thread_layout);
|
||||
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
b_global_local_tile, b_lds_tensor, thread_layout);
|
||||
ck::block_sync_lds();
|
||||
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
|
||||
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
|
||||
|
||||
++i;
|
||||
} while(i < num_loop);
|
||||
|
||||
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
|
||||
}
|
||||
|
||||
template <typename DataType,
|
||||
typename GemmTraits,
|
||||
ck::index_t scalar_per_vector,
|
||||
typename BlockShape,
|
||||
typename ThreadLayoutShape>
|
||||
void PerformGemm(const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K,
|
||||
const BlockShape& tile_shape,
|
||||
const ThreadLayoutShape& thread_layout)
|
||||
{
|
||||
// Global memory buffers
|
||||
DeviceMem a_mem(M * K * sizeof(DataType));
|
||||
DeviceMem b_mem(K * N * sizeof(DataType));
|
||||
DeviceMem c_mem(M * N * sizeof(DataType));
|
||||
|
||||
std::vector<DataType> a_data(M * K);
|
||||
std::vector<DataType> b_data(K * N);
|
||||
ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(a_data);
|
||||
ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(b_data);
|
||||
|
||||
a_mem.ToDevice(a_data.data());
|
||||
b_mem.ToDevice(b_data.data());
|
||||
c_mem.SetZero();
|
||||
|
||||
const ck::index_t grid_size =
|
||||
ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape)) *
|
||||
ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));
|
||||
|
||||
const auto kernel =
|
||||
DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayoutShape>;
|
||||
launch_and_time_kernel(StreamConfig{nullptr},
|
||||
kernel,
|
||||
dim3(grid_size),
|
||||
dim3(ck::wrapper::size(thread_layout)),
|
||||
0,
|
||||
a_mem.GetDeviceBuffer(),
|
||||
b_mem.GetDeviceBuffer(),
|
||||
c_mem.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
tile_shape,
|
||||
thread_layout);
|
||||
|
||||
std::vector<DataType> c_data(M * N);
|
||||
c_mem.FromDevice(c_data.data());
|
||||
|
||||
CheckResult<DataType>(a_data, b_data, c_data, M, N, K);
|
||||
}
|
||||
|
||||
TEST(TestGemm, Float)
|
||||
{
|
||||
using DataType = float;
|
||||
const auto thread_layout = ck::make_tuple(ck::Number<16>{}, ck::Number<16>{});
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1, 4>(
|
||||
512, 512, 128, tile_shape, thread_layout);
|
||||
// Irregular case
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1, 1>(
|
||||
129, 129, 67, tile_shape, thread_layout);
|
||||
}
|
||||
|
||||
TEST(TestGemm, Int8)
|
||||
{
|
||||
using DataType = int8_t;
|
||||
const auto thread_layout = ck::make_tuple(ck::Number<64>{}, ck::Number<4>{});
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1, 16>(
|
||||
512, 512, 128, tile_shape, thread_layout);
|
||||
// Irregular case
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1, 1>(
|
||||
129, 129, 67, tile_shape, thread_layout);
|
||||
}
|
||||
|
||||
TEST(TestGemm, Half)
|
||||
{
|
||||
using DataType = ck::half_t;
|
||||
const auto thread_layout = ck::make_tuple(ck::Number<32>{}, ck::Number<8>{});
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1, 8>(
|
||||
512, 512, 128, tile_shape, thread_layout);
|
||||
// Irregular case
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1, 1>(
|
||||
129, 129, 67, tile_shape, thread_layout);
|
||||
}
|
||||
|
||||
TEST(TestGemm, Float_2x4_4x2_XdlPerWave)
|
||||
{
|
||||
using DataType = float;
|
||||
const auto thread_layout_4x2_xdl_per_wave = ck::make_tuple(ck::Number<16>{}, ck::Number<8>{});
|
||||
const auto thread_layout_2x4_xdl_per_wave = ck::make_tuple(ck::Number<8>{}, ck::Number<16>{});
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_4K1, 4>(
|
||||
512, 512, 128, tile_shape, thread_layout_4x2_xdl_per_wave);
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x4XdlPerWave_4K1, 4>(
|
||||
512, 512, 128, tile_shape, thread_layout_2x4_xdl_per_wave);
|
||||
}
|
||||
@@ -1,474 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform_helper.hpp"
|
||||
|
||||
class TestWrapperLayout : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
static constexpr auto I0 = ck::Number<0>{};
|
||||
static constexpr auto I1 = ck::Number<1>{};
|
||||
|
||||
template <typename Desc,
|
||||
typename Desc1d,
|
||||
typename LayoutRuntime,
|
||||
typename LayoutCompiletime,
|
||||
typename Idxs>
|
||||
void Run(Desc& desc,
|
||||
Desc1d& desc_1d,
|
||||
LayoutRuntime& layout_runtime,
|
||||
LayoutCompiletime& layout_compiletime,
|
||||
const std::vector<Idxs>& idxs)
|
||||
{
|
||||
// 1d check
|
||||
EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime));
|
||||
// Check layout compiletime and runtime result consistency
|
||||
EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime));
|
||||
|
||||
for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++)
|
||||
{
|
||||
const ck::index_t layout_runtime_offset_1d = layout_runtime(ck::make_tuple(i));
|
||||
const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i));
|
||||
const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i));
|
||||
EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d);
|
||||
EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d);
|
||||
}
|
||||
// size(layout)-d check, don't check if access is hierarchical
|
||||
if constexpr(!IsNestedTuple(Idxs{}))
|
||||
{
|
||||
ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) {
|
||||
EXPECT_EQ(desc.GetLength(ck::Number<d>{}), ck::wrapper::size<d>(layout_runtime));
|
||||
EXPECT_EQ(ck::wrapper::size<d>(layout_runtime),
|
||||
ck::wrapper::size<d>(layout_compiletime));
|
||||
});
|
||||
}
|
||||
for(const auto idx : idxs)
|
||||
{
|
||||
const ck::index_t layout_runtime_offset = layout_runtime(idx);
|
||||
const ck::index_t layout_compiletime_offset = layout_compiletime(idx);
|
||||
const ck::index_t desc_offset =
|
||||
desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested
|
||||
EXPECT_EQ(layout_runtime_offset, desc_offset);
|
||||
EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(TestWrapperLayout, 2d)
|
||||
{
|
||||
// dims:(4, 3) strides:(1, 4)
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
constexpr ck::index_t s1 = 1;
|
||||
constexpr ck::index_t s0 = 4;
|
||||
const auto desc =
|
||||
ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
// Reverse due to column major
|
||||
const auto desc_1d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}));
|
||||
const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
|
||||
const auto layout_compiletime =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
|
||||
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs.emplace_back(h, w);
|
||||
}
|
||||
}
|
||||
|
||||
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs);
|
||||
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 3d_nested)
|
||||
{
|
||||
// dims:((2, 3), 4, 3) strides:((2, 4), 12, 48)
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
constexpr ck::index_t s3 = 2;
|
||||
constexpr ck::index_t s2 = 4;
|
||||
constexpr ck::index_t s1 = 12;
|
||||
constexpr ck::index_t s0 = 48;
|
||||
const auto desc = ck::make_naive_tensor_descriptor(
|
||||
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
// Reverse due to column major
|
||||
const auto desc_1d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
|
||||
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}));
|
||||
const auto desc_3d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
|
||||
ck::make_pass_through_transform(d1),
|
||||
ck::make_pass_through_transform(d2)),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
|
||||
const auto layout_runtime =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0),
|
||||
ck::make_tuple(ck::make_tuple(s3, s2), s1, s0));
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(
|
||||
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}), ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
|
||||
ck::Number<s1>{},
|
||||
ck::Number<s0>{}));
|
||||
std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t>> idxs_3d;
|
||||
|
||||
for(ck::index_t d = 0; d < d2 * d3; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs_3d.emplace_back(d, h, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
|
||||
|
||||
// Check also 4d iteration
|
||||
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t, ck::index_t>> idxs_4d;
|
||||
|
||||
for(ck::index_t e = 0; e < d3; e++)
|
||||
{
|
||||
for(ck::index_t d = 0; d < d2; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs_4d.emplace_back(ck::make_tuple(e, d), h, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
|
||||
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 2d_nested)
|
||||
{
|
||||
// dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12))
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
constexpr ck::index_t s3 = 2;
|
||||
constexpr ck::index_t s2 = 4;
|
||||
constexpr ck::index_t s1 = 48;
|
||||
constexpr ck::index_t s0 = 12;
|
||||
const auto desc = ck::make_naive_tensor_descriptor(
|
||||
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
// Reverse due to column major
|
||||
const auto desc_1d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
|
||||
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}));
|
||||
const auto desc_2d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
|
||||
ck::make_merge_transform(ck::make_tuple(d0, d1))),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
|
||||
const auto layout_runtime =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)),
|
||||
ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0)));
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}),
|
||||
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
|
||||
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
|
||||
|
||||
for(ck::index_t h = 0; h < d2 * d3; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0 * d1; w++)
|
||||
{
|
||||
idxs_2d.emplace_back(h, w);
|
||||
}
|
||||
}
|
||||
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
|
||||
// Check also 4d iteration
|
||||
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::Tuple<ck::index_t, ck::index_t>>>
|
||||
idxs_4d;
|
||||
|
||||
for(ck::index_t e = 0; e < d3; e++)
|
||||
{
|
||||
for(ck::index_t d = 0; d < d2; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs_4d.emplace_back(ck::make_tuple(e, d), ck::make_tuple(h, w));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
|
||||
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 3d_double_nested)
|
||||
{
|
||||
// dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24))
|
||||
constexpr ck::index_t d4 = 2;
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
constexpr ck::index_t s4 = 2;
|
||||
constexpr ck::index_t s3 = 4;
|
||||
constexpr ck::index_t s2 = 8;
|
||||
constexpr ck::index_t s1 = 96;
|
||||
constexpr ck::index_t s0 = 24;
|
||||
const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d4>{},
|
||||
ck::Number<d3>{},
|
||||
ck::Number<d2>{},
|
||||
ck::Number<d1>{},
|
||||
ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s4>{},
|
||||
ck::Number<s3>{},
|
||||
ck::Number<s2>{},
|
||||
ck::Number<s1>{},
|
||||
ck::Number<s0>{}));
|
||||
// Reverse due to column major
|
||||
const auto desc_1d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))),
|
||||
ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}));
|
||||
const auto desc_3d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)),
|
||||
ck::make_pass_through_transform(d2),
|
||||
ck::make_merge_transform(ck::make_tuple(d0, d1))),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
|
||||
const auto desc_2d = transform_tensor_descriptor(
|
||||
desc_3d,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)),
|
||||
ck::make_pass_through_transform(d1 * d0)),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
|
||||
const auto layout_runtime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)),
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, s3), s2), ck::make_tuple(s1, s0)));
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
|
||||
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
|
||||
ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
|
||||
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
|
||||
|
||||
for(ck::index_t h = 0; h < d2 * d3 * d4; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0 * d1; w++)
|
||||
{
|
||||
idxs_2d.emplace_back(h, w);
|
||||
}
|
||||
}
|
||||
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
|
||||
// Check also 3d iteration
|
||||
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> idxs_3d;
|
||||
|
||||
for(ck::index_t d = 0; d < d3 * d4; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d2; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d1 * d0; w++)
|
||||
{
|
||||
idxs_3d.emplace_back(ck::make_tuple(d, h), w);
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
|
||||
// Check also 5d iteration
|
||||
std::vector<ck::Tuple<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>,
|
||||
ck::Tuple<ck::index_t, ck::index_t>>>
|
||||
idxs_5d;
|
||||
|
||||
for(ck::index_t f = 0; f < d4; f++)
|
||||
{
|
||||
for(ck::index_t e = 0; e < d3; e++)
|
||||
{
|
||||
for(ck::index_t d = 0; d < d2; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d),
|
||||
ck::make_tuple(h, w));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d);
|
||||
}
|
||||
|
||||
TEST(TestLayoutHelpers, SizeAndGet)
|
||||
{
|
||||
// dims:(((2, 2), 3), (4, 3))
|
||||
constexpr ck::index_t d4 = 2;
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
const auto layout_runtime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
|
||||
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
|
||||
|
||||
// Size of layout
|
||||
EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0);
|
||||
EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0);
|
||||
|
||||
// Size of dims
|
||||
EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2);
|
||||
EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0);
|
||||
|
||||
// Access through new layout (using get with layout object)
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3);
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
|
||||
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4);
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
|
||||
d4);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
|
||||
d3);
|
||||
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
|
||||
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1);
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0);
|
||||
}
|
||||
|
||||
// Verify depth() and rank() for nested layouts, plain tuples and bare integers.
TEST(TestLayoutHelpers, DepthAndRank)
{
    // Hierarchical shape: (((2, 2), 3), (4, 3))
    constexpr ck::index_t dim_e = 2;
    constexpr ck::index_t dim_d = 2;
    constexpr ck::index_t dim_c = 3;
    constexpr ck::index_t dim_b = 4;
    constexpr ck::index_t dim_a = 3;

    const auto dynamic_layout = ck::wrapper::make_layout(ck::make_tuple(
        ck::make_tuple(ck::make_tuple(dim_e, dim_d), dim_c), ck::make_tuple(dim_b, dim_a)));
    const auto static_layout  = ck::wrapper::make_layout(ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<dim_e>{}, ck::Number<dim_d>{}),
                       ck::Number<dim_c>{}),
        ck::make_tuple(ck::Number<dim_b>{}, ck::Number<dim_a>{})));

    // depth(): nesting levels of the shape tree.
    EXPECT_EQ(ck::wrapper::depth(dynamic_layout), 3);
    EXPECT_EQ(ck::wrapper::depth(static_layout), 3);
    EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(dim_e, dim_d), dim_c)), 2);
    // A bare integer has no nesting at all.
    EXPECT_EQ(ck::wrapper::depth(dim_a), 0);

    // rank(): number of top-level dimensions.
    EXPECT_EQ(ck::wrapper::rank(dynamic_layout), 2);
    EXPECT_EQ(ck::wrapper::rank(static_layout), 2);
    EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(dim_e, dim_d), dim_c)), 2);
    // A bare integer counts as a single dimension.
    EXPECT_EQ(ck::wrapper::rank(dim_a), 1);
}
|
||||
|
||||
// Verify that shape() hands back exactly the tuple type the layout was built
// from, both for compile-time (ck::Number) and runtime (ck::index_t) shapes.
TEST(TestLayoutHelpers, ShapeAndStrides)
{
    // Hierarchical dims: (((2, 2), 3), (4, 3))
    constexpr ck::index_t dim_e = 2;
    constexpr ck::index_t dim_d = 2;
    constexpr ck::index_t dim_c = 3;
    constexpr ck::index_t dim_b = 4;
    constexpr ck::index_t dim_a = 3;
    // Matching hierarchical strides.
    constexpr ck::index_t stride_e = 2;
    constexpr ck::index_t stride_d = 4;
    constexpr ck::index_t stride_c = 8;
    constexpr ck::index_t stride_b = 96;
    constexpr ck::index_t stride_a = 24;

    const auto ct_shape = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<dim_e>{}, ck::Number<dim_d>{}),
                       ck::Number<dim_c>{}),
        ck::make_tuple(ck::Number<dim_b>{}, ck::Number<dim_a>{}));
    const auto ct_strides = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<stride_e>{}, ck::Number<stride_d>{}),
                       ck::Number<stride_c>{}),
        ck::make_tuple(ck::Number<stride_b>{}, ck::Number<stride_a>{}));
    const auto rt_shape = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(dim_e, dim_d), dim_c), ck::make_tuple(dim_b, dim_a));
    const auto rt_strides = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(stride_e, stride_d), stride_c),
        ck::make_tuple(stride_b, stride_a));

    const auto rt_layout = ck::wrapper::make_layout(rt_shape, rt_strides);
    const auto ct_layout = ck::wrapper::make_layout(ct_shape, ct_strides);

    // shape() must return the very same tuple type (modulo reference).
    constexpr bool ct_shape_type_matches =
        std::is_same_v<decltype(ct_shape),
                       std::remove_reference_t<decltype(shape(ct_layout))>>;
    constexpr bool rt_shape_type_matches =
        std::is_same_v<decltype(rt_shape),
                       std::remove_reference_t<decltype(shape(rt_layout))>>;
    EXPECT_TRUE(ct_shape_type_matches);
    EXPECT_TRUE(rt_shape_type_matches);
}
|
||||
|
||||
// Verify rank/depth/size/get when addressed with hierarchical (multi-level)
// compile-time index paths like <0, 0> and <0, 0, 0>.
TEST(TestLayoutHelpers, Hierarchical)
{
    // Hierarchical dims: (((2, 2), 3), (4, 3))
    constexpr ck::index_t dim_e = 2;
    constexpr ck::index_t dim_d = 2;
    constexpr ck::index_t dim_c = 3;
    constexpr ck::index_t dim_b = 4;
    constexpr ck::index_t dim_a = 3;

    const auto nested_shape = ck::make_tuple(
        ck::make_tuple(ck::make_tuple(dim_e, dim_d), dim_c), ck::make_tuple(dim_b, dim_a));
    const auto dynamic_layout = ck::wrapper::make_layout(nested_shape);
    const auto static_layout  = ck::wrapper::make_layout(ck::make_tuple(
        ck::make_tuple(ck::make_tuple(ck::Number<dim_e>{}, ck::Number<dim_d>{}),
                       ck::Number<dim_c>{}),
        ck::make_tuple(ck::Number<dim_b>{}, ck::Number<dim_a>{})));

    // rank of the (0, 0) sub-tuple, i.e. (dim_e, dim_d).
    EXPECT_EQ((ck::wrapper::rank<0, 0>(nested_shape)), 2);
    EXPECT_EQ((ck::wrapper::rank<0, 0>(dynamic_layout)), 2);
    EXPECT_EQ((ck::wrapper::rank<0, 0>(static_layout)), 2);

    // depth of the (0, 0) sub-tuple.
    EXPECT_EQ((ck::wrapper::depth<0, 0>(nested_shape)), 1);
    EXPECT_EQ((ck::wrapper::depth<0, 0>(dynamic_layout)), 1);
    EXPECT_EQ((ck::wrapper::depth<0, 0>(static_layout)), 1);

    // size of the (0, 0) sub-tuple: product of its extents.
    EXPECT_EQ((ck::wrapper::size<0, 0>(nested_shape)), dim_e * dim_d);
    EXPECT_EQ((ck::wrapper::size<0, 0>(dynamic_layout)), dim_e * dim_d);
    EXPECT_EQ((ck::wrapper::size<0, 0>(static_layout)), dim_e * dim_d);

    // A full hierarchical path reaches a single extent.
    EXPECT_EQ((ck::wrapper::get<0, 0, 0>(nested_shape)), dim_e);
}
|
||||
@@ -1,104 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
|
||||
// Check make_local_partition() with a projection: a 3d thread layout is
// applied to a 2d tensor by slicing away the first thread dimension.
// Memory is filled with sequentially increasing values, so an element's value
// equals its physical offset and can be checked against the layout directly.
TEST(TestPartition, LocalPartition)
{
    // shape ((16, 4), 4) with strides ((1, 16), 64).
    const auto shape =
        ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
    const auto strides =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
    const auto layout = ck::wrapper::make_layout(shape, strides);

    // value == offset
    std::vector<ck::index_t> data(ck::wrapper::size(layout));
    std::iota(data.begin(), data.end(), 0);

    const auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);

    const auto thread_steps = ck::make_tuple(ck::Number<1>{}, ck::Number<8>{}, ck::Number<1>{});
    const auto thread_layout = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}, ck::Number<1>{});
    // 3d partition on 2d shape (calculate partition on 3d thread layout, and then skip first dim)
    const auto thread_projection =
        ck::make_tuple(ck::wrapper::slice(4), ck::Number<1>{}, ck::Number<1>{});
    // Length of the projected-away thread dimension (matches slice(4) above).
    constexpr ck::index_t projection_thread_length = ck::Number<4>{};

    // Iterate only over the thread ids that remain after the projection.
    for(ck::index_t thread_id = 0;
        thread_id < ck::wrapper::size(thread_layout) / projection_thread_length;
        thread_id++)
    {
        const auto packed_partition =
            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_projection);

        // Each surviving thread owns an equal share of the tensor.
        const auto expected_partition_size =
            ck::wrapper::size(tensor) /
            (ck::wrapper::size(thread_layout) / projection_thread_length);
        // First element advances by the per-thread step along dim 1; the test
        // expects the second element to immediately follow the first in value.
        const auto expected_partition_first_val = thread_id * ck::wrapper::size<1>(thread_steps);
        const auto expected_partition_second_val = expected_partition_first_val + 1;
        EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
        EXPECT_EQ(packed_partition(0), expected_partition_first_val);
        EXPECT_EQ(packed_partition(1), expected_partition_second_val);
    }
}
|
||||
|
||||
// Check make_local_tile() with a projection: a 4d tile shape is applied to a
// 3d tensor by slicing away the last tile dimension. Memory holds sequential
// values, so tile element values can be predicted from block index arithmetic.
TEST(TestPartition, LocalTile)
{
    const auto shape = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{});
    const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{});
    const auto layout = ck::wrapper::make_layout(shape, strides);

    // value == offset
    std::vector<ck::index_t> data(ck::wrapper::size(layout));
    std::iota(data.begin(), data.end(), 0);

    const auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
    // 4d tile partitioning on 3d shape (calculate tile on 4d tile layout, and then skip last dim)
    const auto block_shape =
        ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{}, ck::Number<2>{});
    const auto block_projection =
        ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(2));
    // Length of the projected-away tile dimension (matches slice(2) above).
    constexpr ck::index_t projection_block_dim = ck::Number<2>{};
    // Grid of tiles per (non-projected) dimension.
    const auto num_blocks =
        ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape),
                       ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape),
                       ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape));
    std::vector<ck::index_t> block_idxs(ck::wrapper::size(num_blocks));
    std::iota(block_idxs.begin(), block_idxs.end(), 0);

    for(auto block_idx : block_idxs)
    {
        const auto packed_tile =
            ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_projection);

        const auto expected_tile_size = ck::wrapper::size(block_shape) / projection_block_dim;
        // Decompose the flat block_idx dimension by dimension (last dim is the
        // fastest-varying), accumulating the offset of the tile's first
        // element. NOTE: block_idx is a by-value loop copy, so the in-loop
        // divisions below do not affect the iteration itself.
        auto expected_tile_first_val = (block_idx % ck::wrapper::size<2>(num_blocks)) *
                                       ck::wrapper::size<2>(block_shape) *
                                       ck::wrapper::size<2>(strides);
        block_idx /= ck::wrapper::size<2>(num_blocks);
        expected_tile_first_val += (block_idx % ck::wrapper::size<1>(num_blocks)) *
                                   ck::wrapper::size<1>(block_shape) *
                                   ck::wrapper::size<1>(strides);
        block_idx /= ck::wrapper::size<1>(num_blocks);
        expected_tile_first_val += (block_idx % ck::wrapper::size<0>(num_blocks)) *
                                   ck::wrapper::size<0>(block_shape) *
                                   ck::wrapper::size<0>(strides);

        const auto expected_tile_second_val = expected_tile_first_val + 1;
        EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
        EXPECT_EQ(packed_tile(0), expected_tile_first_val);
        EXPECT_EQ(packed_tile(1), expected_tile_second_val);
    }
}
|
||||
@@ -1,209 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
|
||||
// Compare data in tensor with offset from layout.
|
||||
// Data and offset should match if physical memory has been initialized with
|
||||
// sequentially increasing values from 0.
|
||||
template <typename TensorType>
|
||||
__host__ __device__ bool TestTensorCheck3d(TensorType& tensor)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
|
||||
{
|
||||
const auto idx = ck::make_tuple(ck::make_tuple(d, h), w);
|
||||
if(tensor(idx) != layout(idx))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename TensorType>
|
||||
__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++)
|
||||
{
|
||||
if(tensor(w) - start_offset != layout(ck::make_tuple(w)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <ck::index_t nelems, typename TensorType>
|
||||
__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
bool success = true;
|
||||
ck::static_for<0, nelems, 1>{}([&](auto w) {
|
||||
if(tensor(ck::Number<w.value>{}) != layout(ck::make_tuple(w.value)))
|
||||
{
|
||||
success = false;
|
||||
}
|
||||
});
|
||||
return success;
|
||||
}
|
||||
|
||||
template <typename TensorType>
|
||||
__host__ __device__ void InitTensor(TensorType& tensor)
|
||||
{
|
||||
for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++)
|
||||
{
|
||||
tensor(i) = i;
|
||||
}
|
||||
}
|
||||
|
||||
template <ck::index_t nelems, typename TensorType>
|
||||
__host__ __device__ void StaticInitTensor(TensorType& tensor)
|
||||
{
|
||||
|
||||
ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number<i.value>{}) = i.value; });
|
||||
}
|
||||
|
||||
// Tests
|
||||
// Round-trip a host-memory tensor: write sequential values, then verify them
// through both flat 1d indexing and hierarchical 3d indexing.
TEST(TestTensor, ReadWriteHostMemory)
{
    constexpr ck::index_t nelems = 8;

    std::array<ck::index_t, nelems> data;
    // Hierarchical shape ((2, 2), 2); strides are computed by make_layout.
    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
    auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
    InitTensor(tensor);

    EXPECT_TRUE(TestTensorCheck1d(tensor));
    EXPECT_TRUE(TestTensorCheck3d(tensor));
}
|
||||
|
||||
// Device-side check: build tensors over global memory, LDS (__shared__) and
// registers (Vgpr), initialize each with sequential values, and write the
// AND of all per-tensor checks into *success.
__global__ void TestTensorReadWriteDevice(void* data, void* success)
{
    constexpr ck::index_t nelems = 8;
    // LDS backing storage for the shared-memory tensor.
    __shared__ ck::index_t p_shared[nelems];

    ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
    bool* casted_success_ptr = static_cast<bool*>(success);

    // Hierarchical shape ((2, 2), 2) for global/LDS; flat [nelems] for VGPR.
    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
    constexpr auto vgpr_layout =
        ck::wrapper::make_layout(make_tuple(ck::Number<nelems>{}), make_tuple(ck::Number<1>{}));

    auto tensor_global =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
    auto tensor_vgpr =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
            vgpr_layout);

    InitTensor(tensor_global);
    InitTensor(tensor_lds);
    // Register tensor needs the statically-unrolled initializer.
    StaticInitTensor<nelems>(tensor_vgpr);

    // First write sets the flag; subsequent checks AND into it.
    *casted_success_ptr = TestTensorCheck1d(tensor_global);
    *casted_success_ptr &= TestTensorCheck3d(tensor_global);

    *casted_success_ptr &= TestTensorCheck1d(tensor_lds);
    *casted_success_ptr &= TestTensorCheck3d(tensor_lds);

    *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
}
|
||||
|
||||
// Launch TestTensorReadWriteDevice on a single thread and verify the
// device-computed success flag.
TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
{
    constexpr ck::index_t nelems = 8;
    // NOTE(review): host_data is uninitialized when copied to the device;
    // that is harmless here because the kernel overwrites the buffer via
    // InitTensor before checking it.
    std::array<ck::index_t, nelems> host_data;

    DeviceMem data_buf(nelems * sizeof(ck::index_t));
    data_buf.ToDevice(&host_data[0]);
    DeviceMem success_buf(sizeof(bool));

    // Single block, single thread — the kernel does all element work itself.
    launch_and_time_kernel(StreamConfig{},
                           TestTensorReadWriteDevice,
                           dim3(1),
                           dim3(1),
                           0,
                           data_buf.GetDeviceBuffer(),
                           success_buf.GetDeviceBuffer());

    bool success;
    success_buf.FromDevice(&success);
    EXPECT_TRUE(success);
}
|
||||
|
||||
// Exercise tensor slicing: full slices, offset slices, mixed integer/slice
// indices, and negative slice bounds. Each sliced view's first element is
// checked against the offset the original layout computes for the
// corresponding full index.
TEST(TestTensor, Slicing)
{
    constexpr ck::index_t nelems = 8;

    std::array<ck::index_t, nelems> data;
    const auto shape = ck::make_tuple(ck::make_tuple(2, 2), 2);
    const auto strides = ck::make_tuple(ck::make_tuple(1, 2), 4);
    const auto layout = ck::wrapper::make_layout(shape, strides);
    auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
    InitTensor(tensor);

    // Full slice in every dimension: the view covers the whole tensor.
    auto tensor2x2x2 =
        tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
    EXPECT_EQ(tensor2x2x2(0), layout(ck::make_tuple(ck::make_tuple(0, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
    EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));

    // Fix the first nested dim to 1; slice the rest.
    auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
    EXPECT_EQ(tensor2x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
    EXPECT_TRUE(TestTensorCheck1d(tensor2x2));

    // Ranged slices [1, 2): a single-element view that keeps its rank.
    auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
    EXPECT_EQ(tensor1x1(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 1)));
    EXPECT_EQ(rank(tensor1x1), 2);
    EXPECT_EQ(depth(tensor1x1), 2);
    EXPECT_EQ(size(tensor1x1), 1);
    EXPECT_TRUE(TestTensorCheck1d(tensor1x1));

    // Both nested dims fixed: rank/depth collapse to 1.
    auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
    EXPECT_EQ(tensor2(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
    EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
    EXPECT_EQ(ck::wrapper::size(tensor2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor2));

    // Flat integer index into the nested dim instead of a tuple.
    auto tensor2_v2 = tensor(2, ck::wrapper::slice(0, 2));
    EXPECT_EQ(tensor2_v2(0), layout(ck::make_tuple(2, 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2_v2), 1);
    EXPECT_EQ(ck::wrapper::depth(tensor2_v2), 1);
    EXPECT_EQ(ck::wrapper::size(tensor2_v2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor2_v2));

    // negative indexing
    auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
    EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
    EXPECT_EQ(rank(tensor1x2), 2);
    EXPECT_EQ(depth(tensor1x2), 2);
    EXPECT_EQ(size(tensor1x2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor1x2));
}
|
||||
@@ -1,135 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
#include "ck/wrapper/operations/copy.hpp"
|
||||
|
||||
// Test copy from Global to Global through LDS and VGPR.
// Each block stages its tile in LDS (__shared__), each thread then moves its
// partition LDS -> registers -> output global memory. With UseOptimizedCopy
// the vectorized copy overload (explicit dim order / vector width) is used.
template <typename InputTensor,
          typename OutputTensor,
          typename BlockShape,
          typename ThreadLayout,
          bool UseOptimizedCopy>
__global__ void TestCopyDevice(const InputTensor input_tensor,
                               OutputTensor output_tensor,
                               const BlockShape tile_shape,
                               const ThreadLayout thread_layout)
{
    // LDS staging buffer sized to one tile.
    __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
    const auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        p_shared, ck::wrapper::make_layout(tile_shape));

    const auto block_idxs =
        ck::make_tuple(static_cast<ck::index_t>(blockIdx.x), static_cast<ck::index_t>(blockIdx.y));

    // Get local tiles for global memory
    const auto input_local_tile =
        ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs);
    const auto output_local_tile =
        ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs);

    // Get partition per thread
    const auto input_local_partition =
        ck::wrapper::make_local_partition(input_local_tile, thread_layout, threadIdx.x);
    auto lds_local_partition =
        ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x);
    auto output_local_partition =
        ck::wrapper::make_local_partition(output_local_tile, thread_layout, threadIdx.x);

    // Allocate VGPR
    auto tensor_vgpr =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
            ck::wrapper::make_layout(shape(lds_local_partition)));

    // Perform copy
    if constexpr(UseOptimizedCopy)
    {
        // Traverse dim 1 before dim 0; vectorize along dim 0, 2 scalars/vector.
        using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>>;
        constexpr ck::index_t vector_dim = 0;
        constexpr ck::index_t scalar_per_vector = 2;
        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(input_local_partition,
                                                                         lds_local_partition);
        // TODO: Enable optimized copy for static buffers
        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(lds_local_partition,
                                                                         tensor_vgpr);
        ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(tensor_vgpr,
                                                                         output_local_partition);
    }
    else
    {
        // Generic (unvectorized) copy path.
        ck::wrapper::copy(input_local_partition, lds_local_partition);
        ck::wrapper::copy(lds_local_partition, tensor_vgpr);
        ck::wrapper::copy(tensor_vgpr, output_local_partition);
    }
}
|
||||
|
||||
// Host driver: fill an input buffer with sequential values, run TestCopyDevice
// over a 2d grid of tiles, and verify the output buffer equals the input.
template <bool UseOptimizedCopy>
void PerformCopyGlobalToGlobalViaLDS()
{
    // shape ((2, 2), 256) with strides ((1, 2), 4).
    const auto shape =
        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<256>{});
    const auto strides =
        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{});
    const auto layout = ck::wrapper::make_layout(shape, strides);

    // 0, 1, 2, ..., size(shape) - 1
    std::vector<ck::index_t> input_data(ck::wrapper::size(shape));
    std::iota(input_data.begin(), input_data.end(), 0);

    // Global memory buffers
    DeviceMem in_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
    DeviceMem out_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));

    in_buf.ToDevice(input_data.data());
    out_buf.SetZero();

    // Create tensors for global memory
    const auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const ck::index_t*>(in_buf.GetDeviceBuffer()), layout);
    auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);

    // 32 threads per block; each block copies one 4x64 tile.
    const auto thread_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::Number<1>{}, ck::Number<32>{}));
    const auto tile_shape = ck::make_tuple(ck::Number<4>{}, ck::Number<64>{});

    // One grid cell per tile in each dimension.
    const ck::index_t grid_size_x = ck::math::integer_divide_ceil(
        ck::wrapper::size<0>(input_tensor_global), ck::wrapper::size<0>(tile_shape));
    const ck::index_t grid_size_y = ck::math::integer_divide_ceil(
        ck::wrapper::size<1>(input_tensor_global), ck::wrapper::size<1>(tile_shape));

    const auto kernel = TestCopyDevice<decltype(input_tensor_global),
                                       decltype(output_tensor_global),
                                       decltype(tile_shape),
                                       decltype(thread_layout),
                                       UseOptimizedCopy>;
    launch_and_time_kernel(StreamConfig{},
                           kernel,
                           dim3(grid_size_x, grid_size_y, 1),
                           dim3(ck::wrapper::size(thread_layout)),
                           0,
                           input_tensor_global,
                           output_tensor_global,
                           tile_shape,
                           thread_layout);

    // Verify results
    std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
    out_buf.FromDevice(output_data.data());
    EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
}
|
||||
|
||||
// Generic (unvectorized) copy path.
TEST(TestCopyGlobalToGlobalViaLDS, GenericCopy)
{
    PerformCopyGlobalToGlobalViaLDS<false>();
}
|
||||
// Vectorized copy path (explicit dim order and scalars-per-vector).
TEST(TestCopyGlobalToGlobalViaLDS, OptimizedCopy)
{
    PerformCopyGlobalToGlobalViaLDS<true>();
}
|
||||
@@ -1,376 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/library/utility/fill.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
#include "ck/wrapper/operations/copy.hpp"
|
||||
#include "ck/wrapper/operations/gemm.hpp"
|
||||
#include "ck/wrapper/utils/kernel_utils.hpp"
|
||||
|
||||
// Compute a reference GEMM on the host (float accumulation) from a_data (MxK,
// row-major) and b_data (KxN with strides {1, K}, i.e. column-major), then
// compare c_m_n_device_result against the host result.
template <typename DataType>
void CheckResult(const std::vector<DataType>& a_data,
                 const std::vector<DataType>& b_data,
                 std::vector<DataType>& c_m_n_device_result,
                 const ck::index_t M,
                 const ck::index_t N,
                 const ck::index_t K)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using ReferenceGemmInstance = ck::tensor_operation::host::
        ReferenceGemm<DataType, DataType, DataType, float, PassThrough, PassThrough, PassThrough>;

    Tensor<DataType> a_m_k(HostTensorDescriptor({M, K}));
    // Explicit strides {1, K}: K-contiguous layout for B.
    Tensor<DataType> b_k_n(HostTensorDescriptor({K, N}, {1, K}));
    Tensor<DataType> c_m_n_host_result(HostTensorDescriptor({M, N}));

    a_m_k.mData = a_data;
    b_k_n.mData = b_data;

    auto ref_op = ReferenceGemmInstance{};
    auto ref_invoker = ref_op.MakeInvoker();
    auto ref_argument = ref_op.MakeArgument(
        a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});

    ref_invoker.Run(ref_argument);
    EXPECT_TRUE(ck::utils::check_err(c_m_n_device_result, c_m_n_host_result.mData));
}
|
||||
|
||||
// Return ck::wrapper::pad(layout, padding_dims) when DoPad is true, otherwise
// the layout unchanged. if constexpr is required because the two branches
// return different types.
template <bool DoPad, typename Layout, typename PaddingDims>
__device__ auto ApplyPadding(const Layout& layout, const PaddingDims& padding_dims)
{
    if constexpr(DoPad)
    {
        return ck::wrapper::pad(layout, padding_dims);
    }
    else
    {
        return layout;
    }
}
|
||||
|
||||
template <typename DataType,
|
||||
typename GemmTraits,
|
||||
ck::index_t scalar_per_vector,
|
||||
typename BlockShape,
|
||||
typename ThreadLayout,
|
||||
bool DoPadding>
|
||||
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
|
||||
const void* p_b,
|
||||
void* p_c,
|
||||
const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K,
|
||||
const BlockShape tile_shape,
|
||||
const ThreadLayout thread_layout)
|
||||
{
|
||||
constexpr auto MPerBlock = ck::wrapper::size<0>(tile_shape);
|
||||
constexpr auto NPerBlock = ck::wrapper::size<1>(tile_shape);
|
||||
constexpr auto KPerBlock = ck::wrapper::size<2>(tile_shape);
|
||||
constexpr auto K1 = GemmTraits::K1;
|
||||
constexpr auto K0PerBlock = KPerBlock / K1;
|
||||
const auto K0 = ck::math::integer_divide_ceil(K, K1);
|
||||
|
||||
const auto tile_shape_k0_m_n_k1 = ck::make_tuple(K0PerBlock, MPerBlock, NPerBlock, K1);
|
||||
|
||||
const auto a_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
|
||||
const auto b_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
|
||||
const auto c_global_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
|
||||
|
||||
auto a_padded_global_layout =
|
||||
ApplyPadding<DoPadding>(a_global_layout, ck::make_tuple(MPerBlock, KPerBlock));
|
||||
auto b_padded_global_layout =
|
||||
ApplyPadding<DoPadding>(b_global_layout, ck::make_tuple(NPerBlock, KPerBlock));
|
||||
auto c_padded_global_layout =
|
||||
ApplyPadding<DoPadding>(c_global_layout, ck::make_tuple(MPerBlock, NPerBlock));
|
||||
|
||||
// Reshape from M,K to K0,M,K1
|
||||
const auto reshaped_dims_idxs =
|
||||
ck::make_tuple(ck::Number<1>{}, ck::make_tuple(ck::Number<0>{}, ck::Number<2>{}));
|
||||
auto a_padded_unmerged_global_layout =
|
||||
ck::wrapper::unmerge<1>(a_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
|
||||
auto b_padded_unmerged_global_layout =
|
||||
ck::wrapper::unmerge<1>(b_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
|
||||
|
||||
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_a), a_padded_unmerged_global_layout);
|
||||
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<const DataType*>(p_b), b_padded_unmerged_global_layout);
|
||||
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
|
||||
static_cast<DataType*>(p_c), c_padded_global_layout);
|
||||
|
||||
// Add extra M and N
|
||||
constexpr auto a_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(K0PerBlock, MPerBlock, K1),
|
||||
ck::make_tuple((MPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
|
||||
constexpr auto b_tile_layout = ck::wrapper::make_layout(
|
||||
ck::make_tuple(K0PerBlock, NPerBlock, K1),
|
||||
ck::make_tuple((NPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
|
||||
|
||||
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout) + NPerBlock];
|
||||
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout) + NPerBlock];
|
||||
|
||||
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_a), a_tile_layout);
|
||||
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
|
||||
static_cast<DataType*>(lds_b), b_tile_layout);
|
||||
|
||||
const auto block_idxs = ck::make_tuple(ck::wrapper::slice(),
|
||||
static_cast<ck::index_t>(blockIdx.x),
|
||||
static_cast<ck::index_t>(blockIdx.y),
|
||||
ck::wrapper::slice());
|
||||
using DimAccessOrder = ck::Tuple<ck::Number<1>, ck::Number<0>, ck::Number<2>>;
|
||||
constexpr ck::index_t vector_dim = 2;
|
||||
|
||||
auto c_global_local_tile =
|
||||
ck::wrapper::make_local_tile(c_global_tensor,
|
||||
tile_shape_k0_m_n_k1,
|
||||
block_idxs,
|
||||
make_tuple(ck::wrapper::slice(K0PerBlock),
|
||||
ck::Number<1>{},
|
||||
ck::Number<1>{},
|
||||
ck::wrapper::slice(K1)));
|
||||
auto c_global_local_partition =
|
||||
ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>(c_global_local_tile);
|
||||
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
|
||||
decltype(a_tile_layout),
|
||||
decltype(b_tile_layout),
|
||||
ck::wrapper::size(thread_layout),
|
||||
GemmTraits>();
|
||||
ck::wrapper::clear(c_vgpr_reg);
|
||||
|
||||
auto a_lds_tensor_local_partition =
|
||||
ck::wrapper::make_local_partition(a_lds_tensor, thread_layout, threadIdx.x);
|
||||
auto b_lds_tensor_local_partition =
|
||||
ck::wrapper::make_local_partition(b_lds_tensor, thread_layout, threadIdx.x);
|
||||
|
||||
auto make_global_partition = [&](auto tensor, auto projection, ck::index_t i) {
|
||||
const auto k_slice =
|
||||
ck::make_tuple(ck::wrapper::slice(i * K0PerBlock, (i + 1) * K0PerBlock),
|
||||
ck::wrapper::slice(),
|
||||
ck::wrapper::slice());
|
||||
auto local_tile = ck::wrapper::make_local_tile(
|
||||
tensor(k_slice), tile_shape_k0_m_n_k1, block_idxs, projection);
|
||||
return ck::wrapper::make_local_partition(local_tile, thread_layout, threadIdx.x);
|
||||
};
|
||||
|
||||
auto a_global_local_partition = make_global_partition(
|
||||
a_global_tensor,
|
||||
make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
|
||||
0);
|
||||
auto b_global_local_partition = make_global_partition(
|
||||
b_global_tensor,
|
||||
make_tuple(ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
|
||||
0);
|
||||
|
||||
// (row-major vgpr layout)
|
||||
auto a_vgpr_tensor =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
|
||||
ck::wrapper::make_layout(
|
||||
shape(a_global_local_partition),
|
||||
ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::Number<1>{})));
|
||||
auto b_vgpr_tensor =
|
||||
ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
|
||||
ck::wrapper::make_layout(
|
||||
shape(b_global_local_partition),
|
||||
ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::wrapper::size<2>(a_global_local_partition),
|
||||
ck::Number<1>{})));
|
||||
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_global_local_partition,
|
||||
a_vgpr_tensor);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_global_local_partition,
|
||||
b_vgpr_tensor);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_vgpr_tensor,
|
||||
a_lds_tensor_local_partition);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_vgpr_tensor,
|
||||
b_lds_tensor_local_partition);
|
||||
|
||||
const ck::index_t num_loop =
|
||||
__builtin_amdgcn_readfirstlane(ck::math::integer_divide_ceil(K, KPerBlock));
|
||||
if(num_loop > 1)
|
||||
{
|
||||
ck::index_t i = 0;
|
||||
do
|
||||
{
|
||||
auto a_global_local_partition_i = make_global_partition(
|
||||
a_global_tensor,
|
||||
make_tuple(
|
||||
ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
|
||||
i + 1);
|
||||
auto b_global_local_partition_i = make_global_partition(
|
||||
b_global_tensor,
|
||||
make_tuple(
|
||||
ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
|
||||
i + 1);
|
||||
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
a_global_local_partition_i, a_vgpr_tensor);
|
||||
|
||||
ck::block_sync_lds();
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
b_global_local_partition_i, b_vgpr_tensor);
|
||||
|
||||
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
|
||||
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
|
||||
|
||||
ck::block_sync_lds();
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
a_vgpr_tensor, a_lds_tensor_local_partition);
|
||||
ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
|
||||
b_vgpr_tensor, b_lds_tensor_local_partition);
|
||||
|
||||
++i;
|
||||
} while(i < (num_loop - 1));
|
||||
}
|
||||
ck::block_sync_lds();
|
||||
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
|
||||
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
|
||||
|
||||
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
|
||||
}
|
||||
|
||||
template <typename DataType,
|
||||
typename GemmTraits,
|
||||
ck::index_t scalar_per_vector,
|
||||
bool DoPadding,
|
||||
typename BlockShape,
|
||||
typename ThreadLayout>
|
||||
void PerformGemm(const ck::index_t M,
|
||||
const ck::index_t N,
|
||||
const ck::index_t K,
|
||||
const BlockShape& tile_shape,
|
||||
const ThreadLayout& thread_layout)
|
||||
{
|
||||
// Global memory buffers
|
||||
DeviceMem a_mem(M * K * sizeof(DataType));
|
||||
DeviceMem b_mem(K * N * sizeof(DataType));
|
||||
DeviceMem c_mem(M * N * sizeof(DataType));
|
||||
|
||||
std::vector<DataType> a_data(M * K);
|
||||
std::vector<DataType> b_data(K * N);
|
||||
ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(a_data);
|
||||
ck::utils::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(b_data);
|
||||
|
||||
a_mem.ToDevice(a_data.data());
|
||||
b_mem.ToDevice(b_data.data());
|
||||
c_mem.SetZero();
|
||||
|
||||
const ck::index_t grid_size_x =
|
||||
ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
|
||||
const ck::index_t grid_size_y =
|
||||
ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));
|
||||
|
||||
const auto kernel =
|
||||
DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout, DoPadding>;
|
||||
const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
|
||||
kernel,
|
||||
dim3(grid_size_x, grid_size_y, 1),
|
||||
dim3(ck::wrapper::size(thread_layout)),
|
||||
0,
|
||||
a_mem.GetDeviceBuffer(),
|
||||
b_mem.GetDeviceBuffer(),
|
||||
c_mem.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
tile_shape,
|
||||
thread_layout);
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << std::endl;
|
||||
|
||||
std::vector<DataType> c_data(M * N);
|
||||
c_mem.FromDevice(c_data.data());
|
||||
CheckResult<DataType>(a_data, b_data, c_data, M, N, K);
|
||||
}
|
||||
|
||||
TEST(TestGemm, Float)
|
||||
{
|
||||
using DataType = float;
|
||||
// (dim1, dim2, dim0 thread layout)
|
||||
const auto thread_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
|
||||
ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<16>{});
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1, 4, false>(
|
||||
512, 512, 128, tile_shape, thread_layout);
|
||||
// Irregular case
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1, 1, true>(
|
||||
129, 129, 67, tile_shape, thread_layout);
|
||||
}
|
||||
|
||||
TEST(TestGemm, Int8)
|
||||
{
|
||||
using DataType = int8_t;
|
||||
const auto thread_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
|
||||
ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<64>{});
|
||||
PerformGemm<DataType,
|
||||
ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1,
|
||||
16,
|
||||
false>(512, 512, 128, tile_shape, thread_layout);
|
||||
// Irregular case
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1, 1, true>(
|
||||
129, 129, 67, tile_shape, thread_layout);
|
||||
}
|
||||
|
||||
TEST(TestGemm, Half)
|
||||
{
|
||||
using DataType = ck::half_t;
|
||||
const auto thread_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
|
||||
ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<128>{}, ck::Number<128>{}, ck::Number<32>{});
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1, 8, false>(
|
||||
512, 512, 128, tile_shape, thread_layout);
|
||||
// Irregular case
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1, 1, true>(
|
||||
129, 129, 67, tile_shape, thread_layout);
|
||||
}
|
||||
|
||||
TEST(TestGemm, Float_2x4_4x2_XdlPerWave)
|
||||
{
|
||||
using DataType = float;
|
||||
const auto thread_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
|
||||
ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
|
||||
const auto tile_shape = ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<16>{});
|
||||
PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_4K1, 4, false>(
|
||||
512, 512, 128, tile_shape, thread_layout);
|
||||
}
|
||||
@@ -1,474 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/utility/common_header.hpp"
|
||||
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
|
||||
#include "ck/tensor_description/tensor_descriptor.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
#include "ck/tensor_description/multi_index_transform_helper.hpp"
|
||||
|
||||
class TestWrapperLayout : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
static constexpr auto I0 = ck::Number<0>{};
|
||||
static constexpr auto I1 = ck::Number<1>{};
|
||||
|
||||
template <typename Desc,
|
||||
typename Desc1d,
|
||||
typename LayoutRuntime,
|
||||
typename LayoutCompiletime,
|
||||
typename Idxs>
|
||||
void Run(Desc& desc,
|
||||
Desc1d& desc_1d,
|
||||
LayoutRuntime& layout_runtime,
|
||||
LayoutCompiletime& layout_compiletime,
|
||||
const std::vector<Idxs>& idxs)
|
||||
{
|
||||
// 1d check
|
||||
EXPECT_EQ(desc_1d.GetLength(I0), ck::wrapper::size(layout_runtime));
|
||||
// Check layout compiletime and runtime result consistency
|
||||
EXPECT_EQ(ck::wrapper::size(layout_runtime), ck::wrapper::size(layout_compiletime));
|
||||
|
||||
for(ck::index_t i = 0; i < desc_1d.GetLength(I0); i++)
|
||||
{
|
||||
const ck::index_t layout_runtime_offset_1d = layout_runtime(ck::make_tuple(i));
|
||||
const ck::index_t layout_compiletime_offset_1d = layout_compiletime(ck::make_tuple(i));
|
||||
const ck::index_t desc_offset_1d = desc_1d.CalculateOffset(ck::make_tuple(i));
|
||||
EXPECT_EQ(layout_runtime_offset_1d, desc_offset_1d);
|
||||
EXPECT_EQ(layout_compiletime_offset_1d, layout_runtime_offset_1d);
|
||||
}
|
||||
// size(layout)-d check, don't check if access is hierarchical
|
||||
if constexpr(!IsNestedTuple(Idxs{}))
|
||||
{
|
||||
ck::static_for<0, Idxs::Size(), 1>{}([&](auto d) {
|
||||
EXPECT_EQ(desc.GetLength(ck::Number<d>{}), ck::wrapper::size<d>(layout_runtime));
|
||||
EXPECT_EQ(ck::wrapper::size<d>(layout_runtime),
|
||||
ck::wrapper::size<d>(layout_compiletime));
|
||||
});
|
||||
}
|
||||
for(const auto idx : idxs)
|
||||
{
|
||||
const ck::index_t layout_runtime_offset = layout_runtime(idx);
|
||||
const ck::index_t layout_compiletime_offset = layout_compiletime(idx);
|
||||
const ck::index_t desc_offset =
|
||||
desc.CalculateOffset(UnrollNestedTuple(idx)); // Unroll if nested
|
||||
EXPECT_EQ(layout_runtime_offset, desc_offset);
|
||||
EXPECT_EQ(layout_runtime_offset, layout_compiletime_offset);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(TestWrapperLayout, 2d)
|
||||
{
|
||||
// dims:(4, 3) strides:(1, 4)
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
constexpr ck::index_t s1 = 1;
|
||||
constexpr ck::index_t s0 = 4;
|
||||
const auto desc =
|
||||
ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
// Reverse due to column major
|
||||
const auto desc_1d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1))),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}));
|
||||
const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
|
||||
const auto layout_compiletime =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
|
||||
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs.emplace_back(h, w);
|
||||
}
|
||||
}
|
||||
|
||||
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs);
|
||||
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 3d_nested)
|
||||
{
|
||||
// dims:((2, 3), 4, 3) strides:((2, 4), 12, 48)
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
constexpr ck::index_t s3 = 2;
|
||||
constexpr ck::index_t s2 = 4;
|
||||
constexpr ck::index_t s1 = 12;
|
||||
constexpr ck::index_t s0 = 48;
|
||||
const auto desc = ck::make_naive_tensor_descriptor(
|
||||
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
// Reverse due to column major
|
||||
const auto desc_1d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
|
||||
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}));
|
||||
const auto desc_3d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
|
||||
ck::make_pass_through_transform(d1),
|
||||
ck::make_pass_through_transform(d2)),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<3>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
|
||||
const auto layout_runtime =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), d1, d0),
|
||||
ck::make_tuple(ck::make_tuple(s3, s2), s1, s0));
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(
|
||||
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}), ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
|
||||
ck::Number<s1>{},
|
||||
ck::Number<s0>{}));
|
||||
std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t>> idxs_3d;
|
||||
|
||||
for(ck::index_t d = 0; d < d2 * d3; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs_3d.emplace_back(d, h, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
|
||||
|
||||
// Check also 4d iteration
|
||||
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t, ck::index_t>> idxs_4d;
|
||||
|
||||
for(ck::index_t e = 0; e < d3; e++)
|
||||
{
|
||||
for(ck::index_t d = 0; d < d2; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs_4d.emplace_back(ck::make_tuple(e, d), h, w);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
|
||||
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 2d_nested)
|
||||
{
|
||||
// dims:((2, 3), (4, 3)) strides:((2, 4), (48, 12))
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
constexpr ck::index_t s3 = 2;
|
||||
constexpr ck::index_t s2 = 4;
|
||||
constexpr ck::index_t s1 = 48;
|
||||
constexpr ck::index_t s0 = 12;
|
||||
const auto desc = ck::make_naive_tensor_descriptor(
|
||||
ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}, ck::Number<d1>{}, ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}, ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
// Reverse due to column major
|
||||
const auto desc_1d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3))),
|
||||
ck::make_tuple(ck::Sequence<3, 2, 1, 0>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}));
|
||||
const auto desc_2d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3)),
|
||||
ck::make_merge_transform(ck::make_tuple(d0, d1))),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<3, 2>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
|
||||
const auto layout_runtime =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(d3, d2), ck::make_tuple(d1, d0)),
|
||||
ck::make_tuple(ck::make_tuple(s3, s2), ck::make_tuple(s1, s0)));
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d3>{}, ck::Number<d2>{}),
|
||||
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<s3>{}, ck::Number<s2>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
|
||||
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
|
||||
|
||||
for(ck::index_t h = 0; h < d2 * d3; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0 * d1; w++)
|
||||
{
|
||||
idxs_2d.emplace_back(h, w);
|
||||
}
|
||||
}
|
||||
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
|
||||
// Check also 4d iteration
|
||||
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::Tuple<ck::index_t, ck::index_t>>>
|
||||
idxs_4d;
|
||||
|
||||
for(ck::index_t e = 0; e < d3; e++)
|
||||
{
|
||||
for(ck::index_t d = 0; d < d2; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs_4d.emplace_back(ck::make_tuple(e, d), ck::make_tuple(h, w));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_4d);
|
||||
}
|
||||
|
||||
TEST_F(TestWrapperLayout, 3d_double_nested)
|
||||
{
|
||||
// dims:(((2, 2), 3), (4, 3)) strides:(((2, 4), 8), (96, 24))
|
||||
constexpr ck::index_t d4 = 2;
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
constexpr ck::index_t s4 = 2;
|
||||
constexpr ck::index_t s3 = 4;
|
||||
constexpr ck::index_t s2 = 8;
|
||||
constexpr ck::index_t s1 = 96;
|
||||
constexpr ck::index_t s0 = 24;
|
||||
const auto desc = ck::make_naive_tensor_descriptor(ck::make_tuple(ck::Number<d4>{},
|
||||
ck::Number<d3>{},
|
||||
ck::Number<d2>{},
|
||||
ck::Number<d1>{},
|
||||
ck::Number<d0>{}),
|
||||
ck::make_tuple(ck::Number<s4>{},
|
||||
ck::Number<s3>{},
|
||||
ck::Number<s2>{},
|
||||
ck::Number<s1>{},
|
||||
ck::Number<s0>{}));
|
||||
// Reverse due to column major
|
||||
const auto desc_1d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d0, d1, d2, d3, d4))),
|
||||
ck::make_tuple(ck::Sequence<4, 3, 2, 1, 0>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}));
|
||||
const auto desc_3d = transform_tensor_descriptor(
|
||||
desc,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d3, d4)),
|
||||
ck::make_pass_through_transform(d2),
|
||||
ck::make_merge_transform(ck::make_tuple(d0, d1))),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}, ck::Sequence<4, 3>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}, ck::Sequence<2>{}));
|
||||
const auto desc_2d = transform_tensor_descriptor(
|
||||
desc_3d,
|
||||
ck::make_tuple(ck::make_merge_transform(ck::make_tuple(d2, d3 * d4)),
|
||||
ck::make_pass_through_transform(d1 * d0)),
|
||||
ck::make_tuple(ck::Sequence<1, 0>{}, ck::Sequence<2>{}),
|
||||
ck::make_tuple(ck::Sequence<0>{}, ck::Sequence<1>{}));
|
||||
const auto layout_runtime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)),
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, s3), s2), ck::make_tuple(s1, s0)));
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
|
||||
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})),
|
||||
ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{})));
|
||||
std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs_2d;
|
||||
|
||||
for(ck::index_t h = 0; h < d2 * d3 * d4; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0 * d1; w++)
|
||||
{
|
||||
idxs_2d.emplace_back(h, w);
|
||||
}
|
||||
}
|
||||
this->Run(desc_2d, desc_1d, layout_runtime, layout_compiletime, idxs_2d);
|
||||
// Check also 3d iteration
|
||||
std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> idxs_3d;
|
||||
|
||||
for(ck::index_t d = 0; d < d3 * d4; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d2; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d1 * d0; w++)
|
||||
{
|
||||
idxs_3d.emplace_back(ck::make_tuple(d, h), w);
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc_3d, desc_1d, layout_runtime, layout_compiletime, idxs_3d);
|
||||
// Check also 5d iteration
|
||||
std::vector<ck::Tuple<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>,
|
||||
ck::Tuple<ck::index_t, ck::index_t>>>
|
||||
idxs_5d;
|
||||
|
||||
for(ck::index_t f = 0; f < d4; f++)
|
||||
{
|
||||
for(ck::index_t e = 0; e < d3; e++)
|
||||
{
|
||||
for(ck::index_t d = 0; d < d2; d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < d1; h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < d0; w++)
|
||||
{
|
||||
idxs_5d.emplace_back(ck::make_tuple(ck::make_tuple(f, e), d),
|
||||
ck::make_tuple(h, w));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
this->Run(desc, desc_1d, layout_runtime, layout_compiletime, idxs_5d);
|
||||
}
|
||||
|
||||
TEST(TestLayoutHelpers, SizeAndGet)
|
||||
{
|
||||
// dims:(((2, 2), 3), (4, 3))
|
||||
constexpr ck::index_t d4 = 2;
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
const auto layout_runtime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
|
||||
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
|
||||
|
||||
// Size of layout
|
||||
EXPECT_EQ(ck::wrapper::size(layout_runtime), d4 * d3 * d2 * d1 * d0);
|
||||
EXPECT_EQ(ck::wrapper::size(layout_compiletime), d4 * d3 * d2 * d1 * d0);
|
||||
|
||||
// Size of dims
|
||||
EXPECT_EQ(ck::wrapper::size<0>(layout_runtime), d4 * d3 * d2);
|
||||
EXPECT_EQ(ck::wrapper::size<0>(layout_compiletime), d4 * d3 * d2);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(layout_runtime), d1 * d0);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(layout_compiletime), d1 * d0);
|
||||
|
||||
// Access through new layout (using get with layout object)
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_runtime)), d4 * d3);
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(layout_compiletime)), d4 * d3);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
|
||||
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d4);
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
|
||||
d4);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_runtime))), d3);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(ck::wrapper::get<0>(layout_compiletime))),
|
||||
d3);
|
||||
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_runtime)), d2);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<0>(layout_compiletime)), d2);
|
||||
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_runtime)), d1);
|
||||
EXPECT_EQ(ck::wrapper::size<0>(ck::wrapper::get<1>(layout_compiletime)), d1);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_runtime)), d0);
|
||||
EXPECT_EQ(ck::wrapper::size<1>(ck::wrapper::get<1>(layout_compiletime)), d0);
|
||||
}
|
||||
|
||||
TEST(TestLayoutHelpers, DepthAndRank)
|
||||
{
|
||||
// dims:(((2, 2), 3), (4, 3))
|
||||
constexpr ck::index_t d4 = 2;
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
const auto layout_runtime = ck::wrapper::make_layout(
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0)));
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
|
||||
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
|
||||
|
||||
EXPECT_EQ(ck::wrapper::depth(layout_runtime), 3);
|
||||
EXPECT_EQ(ck::wrapper::depth(layout_compiletime), 3);
|
||||
EXPECT_EQ(ck::wrapper::depth(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
|
||||
// Check for integer
|
||||
EXPECT_EQ(ck::wrapper::depth(d0), 0);
|
||||
|
||||
EXPECT_EQ(ck::wrapper::rank(layout_runtime), 2);
|
||||
EXPECT_EQ(ck::wrapper::rank(layout_compiletime), 2);
|
||||
EXPECT_EQ(ck::wrapper::rank(ck::make_tuple(ck::make_tuple(d4, d3), d2)), 2);
|
||||
// Check for integer
|
||||
EXPECT_EQ(ck::wrapper::rank(d0), 1);
|
||||
}
|
||||
|
||||
TEST(TestLayoutHelpers, ShapeAndStrides)
|
||||
{
|
||||
// dims:(((2, 2), 3), (4, 3))
|
||||
constexpr ck::index_t d4 = 2;
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
constexpr ck::index_t s4 = 2;
|
||||
constexpr ck::index_t s3 = 4;
|
||||
constexpr ck::index_t s2 = 8;
|
||||
constexpr ck::index_t s1 = 96;
|
||||
constexpr ck::index_t s0 = 24;
|
||||
const auto shape_compiletime = ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
|
||||
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
|
||||
const auto strides_compiletime = ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<s4>{}, ck::Number<s3>{}), ck::Number<s2>{}),
|
||||
ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
|
||||
const auto shape_runtime =
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
|
||||
const auto strides_runtime =
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(s4, s3), s2), ck::make_tuple(s1, s0));
|
||||
const auto layout_runtime = ck::wrapper::make_layout(shape_runtime, strides_runtime);
|
||||
const auto layout_compiletime =
|
||||
ck::wrapper::make_layout(shape_compiletime, strides_compiletime);
|
||||
|
||||
constexpr bool check_compiletime_shape =
|
||||
std::is_same_v<decltype(shape_compiletime),
|
||||
std::remove_reference_t<decltype(shape(layout_compiletime))>>;
|
||||
constexpr bool check_runtime_shape =
|
||||
std::is_same_v<decltype(shape_runtime),
|
||||
std::remove_reference_t<decltype(shape(layout_runtime))>>;
|
||||
EXPECT_TRUE(check_compiletime_shape);
|
||||
EXPECT_TRUE(check_runtime_shape);
|
||||
}
|
||||
|
||||
TEST(TestLayoutHelpers, Hierarchical)
|
||||
{
|
||||
// dims:(((2, 2), 3), (4, 3))
|
||||
constexpr ck::index_t d4 = 2;
|
||||
constexpr ck::index_t d3 = 2;
|
||||
constexpr ck::index_t d2 = 3;
|
||||
constexpr ck::index_t d1 = 4;
|
||||
constexpr ck::index_t d0 = 3;
|
||||
const auto runtime_shape =
|
||||
ck::make_tuple(ck::make_tuple(ck::make_tuple(d4, d3), d2), ck::make_tuple(d1, d0));
|
||||
const auto layout_runtime = ck::wrapper::make_layout(runtime_shape);
|
||||
const auto layout_compiletime = ck::wrapper::make_layout(ck::make_tuple(
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<d4>{}, ck::Number<d3>{}), ck::Number<d2>{}),
|
||||
ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{})));
|
||||
|
||||
EXPECT_EQ((ck::wrapper::rank<0, 0>(runtime_shape)), 2);
|
||||
EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_runtime)), 2);
|
||||
EXPECT_EQ((ck::wrapper::rank<0, 0>(layout_compiletime)), 2);
|
||||
|
||||
EXPECT_EQ((ck::wrapper::depth<0, 0>(runtime_shape)), 1);
|
||||
EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_runtime)), 1);
|
||||
EXPECT_EQ((ck::wrapper::depth<0, 0>(layout_compiletime)), 1);
|
||||
|
||||
EXPECT_EQ((ck::wrapper::size<0, 0>(runtime_shape)), d4 * d3);
|
||||
EXPECT_EQ((ck::wrapper::size<0, 0>(layout_runtime)), d4 * d3);
|
||||
EXPECT_EQ((ck::wrapper::size<0, 0>(layout_compiletime)), d4 * d3);
|
||||
|
||||
EXPECT_EQ((ck::wrapper::get<0, 0, 0>(runtime_shape)), d4);
|
||||
}
|
||||
@@ -1,115 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <numeric>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck/host_utility/kernel_launch.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/utility/common_header.hpp"
|
||||
#include "ck/wrapper/layout.hpp"
|
||||
#include "ck/wrapper/tensor.hpp"
|
||||
|
||||
TEST(TestPartition, LocalPartition)
|
||||
{
|
||||
const auto shape =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
|
||||
const auto strides =
|
||||
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
|
||||
const auto layout = ck::wrapper::make_layout(shape, strides);
|
||||
|
||||
std::vector<ck::index_t> data(ck::wrapper::size(layout));
|
||||
std::iota(data.begin(), data.end(), 0);
|
||||
|
||||
const auto tensor =
|
||||
ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
|
||||
|
||||
const auto thread_steps = ck::make_tuple(ck::Number<1>{}, ck::Number<8>{}, ck::Number<1>{});
|
||||
// row-major thread layout
|
||||
const auto thread_layout =
|
||||
ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}, ck::Number<1>{}),
|
||||
ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}, ck::Number<1>{}));
|
||||
// 3d partition on 2d shape (calculate partition on 3d thread layout, and then skip first dim)
|
||||
const auto thread_projection =
|
||||
ck::make_tuple(ck::wrapper::slice(4), ck::Number<1>{}, ck::Number<1>{});
|
||||
constexpr ck::index_t projection_thread_length = ck::Number<4>{};
|
||||
|
||||
for(ck::index_t thread_id = 0;
|
||||
thread_id < ck::wrapper::size(thread_layout) / projection_thread_length;
|
||||
thread_id++)
|
||||
{
|
||||
const auto packed_partition =
|
||||
ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_projection);
|
||||
|
||||
const auto expected_partition_size =
|
||||
ck::wrapper::size(tensor) /
|
||||
(ck::wrapper::size(thread_layout) / projection_thread_length);
|
||||
const auto expected_partition_first_val = thread_id * ck::wrapper::size<1>(thread_steps);
|
||||
const auto expected_partition_second_val = expected_partition_first_val + 1;
|
||||
EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
|
||||
EXPECT_EQ(packed_partition(0), expected_partition_first_val);
|
||||
EXPECT_EQ(packed_partition(1), expected_partition_second_val);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TestPartition, LocalTile)
{
    // Tile a 16x4x4 tensor (column-major-ish strides 1/16/64) initialized with
    // 0..N-1, so every element value equals its physical offset.
    const auto shape   = ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}, ck::Number<4>{});
    const auto strides = ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}, ck::Number<64>{});
    const auto layout  = ck::wrapper::make_layout(shape, strides);

    std::vector<ck::index_t> buffer(ck::wrapper::size(layout));
    std::iota(buffer.begin(), buffer.end(), 0);

    const auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(buffer.data(), layout);

    // 4d tile partitioning on 3d shape (tile is calculated on the 4d tile
    // layout, then the last dim is skipped via the projection).
    const auto block_shape =
        ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}, ck::Number<2>{}, ck::Number<2>{});
    const auto block_projection =
        ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(2));

    // Number of blocks per (non-projected) dimension.
    const auto grid_shape =
        ck::make_tuple(ck::wrapper::size<0>(shape) / ck::wrapper::size<0>(block_shape),
                       ck::wrapper::size<1>(shape) / ck::wrapper::size<1>(block_shape),
                       ck::wrapper::size<2>(shape) / ck::wrapper::size<2>(block_shape));

    // Enumerate every block coordinate; the 4th (projected) coordinate is 0.
    std::vector<ck::Tuple<ck::index_t, ck::index_t, ck::index_t, ck::index_t>> block_idxs;
    for(int bx = 0; bx < ck::wrapper::size<0>(grid_shape); bx++)
    {
        for(int by = 0; by < ck::wrapper::size<1>(grid_shape); by++)
        {
            for(int bz = 0; bz < ck::wrapper::size<2>(grid_shape); bz++)
            {
                block_idxs.emplace_back(bx, by, bz, 0);
            }
        }
    }

    // Extent of the dimension removed by the projection (slice(2) above).
    constexpr ck::index_t projection_block_dim = ck::Number<2>{};
    for(auto block_idx : block_idxs)
    {
        const auto packed_tile =
            ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_projection);

        const auto expected_tile_size = ck::wrapper::size(block_shape) / projection_block_dim;

        // Offset of the tile origin: sum over the three non-projected
        // dimensions of (block coordinate * block extent * stride).
        auto expected_tile_first_val = ck::wrapper::size<0>(block_idx) *
                                       ck::wrapper::size<0>(block_shape) *
                                       ck::wrapper::size<0>(strides);
        expected_tile_first_val += ck::wrapper::size<1>(block_idx) *
                                   ck::wrapper::size<1>(block_shape) *
                                   ck::wrapper::size<1>(strides);
        expected_tile_first_val += ck::wrapper::size<2>(block_idx) *
                                   ck::wrapper::size<2>(block_shape) *
                                   ck::wrapper::size<2>(strides);

        const auto expected_tile_second_val = expected_tile_first_val + 1;
        EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
        EXPECT_EQ(packed_tile(0), expected_tile_first_val);
        EXPECT_EQ(packed_tile(1), expected_tile_second_val);
    }
}
|
||||
@@ -1,209 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <array>
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <vector>

#include <gtest/gtest.h>

#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/utility/common_header.hpp"

#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
|
||||
|
||||
// Compare data in tensor with offset from layout.
|
||||
// Data and offset should match if physical memory has been initialized with
|
||||
// sequentially increasing values from 0.
|
||||
template <typename TensorType>
|
||||
__host__ __device__ bool TestTensorCheck3d(TensorType& tensor)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
|
||||
{
|
||||
for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
|
||||
{
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
|
||||
{
|
||||
const auto idx = ck::make_tuple(ck::make_tuple(d, h), w);
|
||||
if(tensor(idx) != layout(idx))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename TensorType>
|
||||
__host__ __device__ bool TestTensorCheck1d(TensorType& tensor, ck::index_t start_offset = 0)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
for(ck::index_t w = 0; w < ck::wrapper::size<0>(layout); w++)
|
||||
{
|
||||
if(tensor(w) - start_offset != layout(ck::make_tuple(w)))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <ck::index_t nelems, typename TensorType>
|
||||
__host__ __device__ bool StaticTestTensorCheck1d(TensorType& tensor)
|
||||
{
|
||||
const auto& layout = ck::wrapper::layout(tensor);
|
||||
bool success = true;
|
||||
ck::static_for<0, nelems, 1>{}([&](auto w) {
|
||||
if(tensor(ck::Number<w.value>{}) != layout(ck::make_tuple(w.value)))
|
||||
{
|
||||
success = false;
|
||||
}
|
||||
});
|
||||
return success;
|
||||
}
|
||||
|
||||
template <typename TensorType>
|
||||
__host__ __device__ void InitTensor(TensorType& tensor)
|
||||
{
|
||||
for(ck::index_t i = 0; i < ck::wrapper::size(ck::wrapper::layout(tensor)); i++)
|
||||
{
|
||||
tensor(i) = i;
|
||||
}
|
||||
}
|
||||
|
||||
template <ck::index_t nelems, typename TensorType>
|
||||
__host__ __device__ void StaticInitTensor(TensorType& tensor)
|
||||
{
|
||||
|
||||
ck::static_for<0, nelems, 1>{}([&](auto i) { tensor(ck::Number<i.value>{}) = i.value; });
|
||||
}
|
||||
|
||||
// Tests
|
||||
TEST(TestTensor, ReadWriteHostMemory)
{
    constexpr ck::index_t nelems = 8;

    // Hierarchical ((2, 2), 2) layout wrapped over a plain host buffer.
    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
    std::array<ck::index_t, nelems> host_buffer;
    auto tensor =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(host_buffer.data(), layout);
    InitTensor(tensor);

    // Values must match layout offsets through both the 1d and 3d access path.
    EXPECT_TRUE(TestTensorCheck1d(tensor));
    EXPECT_TRUE(TestTensorCheck3d(tensor));
}
|
||||
|
||||
// Device-side smoke test for the tensor wrapper over three GPU memory spaces:
// global (the `data` buffer), LDS (the `__shared__` array) and registers (the
// VGPR tensor). Each tensor is filled with sequentially increasing values and
// checked against its layout offsets; the combined result is written to
// `*success`.
// NOTE(review): no barrier separates the LDS writes from the reads, so this
// appears to assume a single-thread launch — confirm at the call site.
__global__ void TestTensorReadWriteDevice(void* data, void* success)
{
    constexpr ck::index_t nelems = 8;
    // Backing storage for the LDS tensor.
    __shared__ ck::index_t p_shared[nelems];

    ck::index_t* casted_data_ptr = static_cast<ck::index_t*>(data);
    bool* casted_success_ptr = static_cast<bool*>(success);

    // Hierarchical ((2, 2), 2) layout shared by the global and LDS tensors.
    const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
    // Flat, fully compile-time layout used for the register tensor.
    constexpr auto vgpr_layout =
        ck::wrapper::make_layout(make_tuple(ck::Number<nelems>{}), make_tuple(ck::Number<1>{}));

    auto tensor_global =
        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(p_shared, layout);
    auto tensor_vgpr =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, ck::index_t>(
            vgpr_layout);

    // Fill each memory space with 0..nelems-1.
    InitTensor(tensor_global);
    InitTensor(tensor_lds);
    StaticInitTensor<nelems>(tensor_vgpr);

    // Accumulate all checks into the single output flag.
    *casted_success_ptr = TestTensorCheck1d(tensor_global);
    *casted_success_ptr &= TestTensorCheck3d(tensor_global);

    *casted_success_ptr &= TestTensorCheck1d(tensor_lds);
    *casted_success_ptr &= TestTensorCheck3d(tensor_lds);

    *casted_success_ptr &= StaticTestTensorCheck1d<nelems>(tensor_vgpr);
}
|
||||
|
||||
// End-to-end device test: upload a buffer, let the kernel initialize and
// verify tensors in global/LDS/register memory, then read back the result.
TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
{
    constexpr ck::index_t nelems = 8;
    // Value-initialize so ToDevice() does not copy indeterminate bytes
    // (the kernel overwrites the buffer anyway, but copying uninitialized
    // storage reads indeterminate values on the host side).
    std::array<ck::index_t, nelems> host_data{};

    DeviceMem data_buf(nelems * sizeof(ck::index_t));
    data_buf.ToDevice(&host_data[0]);
    DeviceMem success_buf(sizeof(bool));

    // Single block, single thread: the kernel performs all init and checks.
    launch_and_time_kernel(StreamConfig{},
                           TestTensorReadWriteDevice,
                           dim3(1),
                           dim3(1),
                           0,
                           data_buf.GetDeviceBuffer(),
                           success_buf.GetDeviceBuffer());

    // Initialized to a failing default in case the copy-back leaves it untouched.
    bool success = false;
    success_buf.FromDevice(&success);
    EXPECT_TRUE(success);
}
|
||||
|
||||
// Slicing a ((2, 2), 2) tensor with explicit strides ((1, 2), 4). The buffer
// is initialized with 0..nelems-1, so every element equals its layout offset.
// All rank/depth/size queries are ck::wrapper-qualified for consistency
// (previously a few relied on unqualified ADL lookup).
TEST(TestTensor, Slicing)
{
    constexpr ck::index_t nelems = 8;

    std::array<ck::index_t, nelems> data;
    const auto shape   = ck::make_tuple(ck::make_tuple(2, 2), 2);
    const auto strides = ck::make_tuple(ck::make_tuple(1, 2), 4);
    const auto layout  = ck::wrapper::make_layout(shape, strides);
    auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);
    InitTensor(tensor);

    // Full slice in every dimension: all 8 elements, hierarchy preserved.
    auto tensor2x2x2 =
        tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
    EXPECT_EQ(tensor2x2x2(0), layout(ck::make_tuple(ck::make_tuple(0, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
    EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));

    // Fix the first nested coordinate to 1: 4 elements remain.
    auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
    EXPECT_EQ(tensor2x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
    EXPECT_TRUE(TestTensorCheck1d(tensor2x2));

    // Sub-ranges [1, 2) in the remaining dimensions: a single element.
    auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
    EXPECT_EQ(tensor1x1(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 1)));
    EXPECT_EQ(ck::wrapper::rank(tensor1x1), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor1x1), 2);
    EXPECT_EQ(ck::wrapper::size(tensor1x1), 1);
    EXPECT_TRUE(TestTensorCheck1d(tensor1x1));

    // Fix both nested coordinates: slicing only the flat dimension gives rank 1.
    auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
    EXPECT_EQ(tensor2(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
    EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
    EXPECT_EQ(ck::wrapper::size(tensor2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor2));

    // A flat index into the nested pair selects the same way.
    auto tensor2_v2 = tensor(2, ck::wrapper::slice(0, 2));
    EXPECT_EQ(tensor2_v2(0), layout(ck::make_tuple(2, 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor2_v2), 1);
    EXPECT_EQ(ck::wrapper::depth(tensor2_v2), 1);
    EXPECT_EQ(ck::wrapper::size(tensor2_v2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor2_v2));

    // Negative indexing: slice end counted from the dimension's end.
    auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
    EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
    EXPECT_EQ(ck::wrapper::rank(tensor1x2), 2);
    EXPECT_EQ(ck::wrapper::depth(tensor1x2), 2);
    EXPECT_EQ(ck::wrapper::size(tensor1x2), 2);
    EXPECT_TRUE(TestTensorCheck1d(tensor1x2));
}
|
||||
Reference in New Issue
Block a user